pabsw
__m128i _mm_abs_epi16 (__m128i a)
Synopsis
__m128i _mm_abs_epi16 (__m128i a)
#include "tmmintrin.h"
Instruction: pabsw xmm, xmm
CPUID Flags: SSSE3
Description
Compute the absolute value of packed 16-bit integers in a, and store the unsigned results in dst.
Operation
FOR j := 0 to 7
i := j*16
dst[i+15:i] := ABS(a[i+15:i])
ENDFOR
Performance
vpabsw
__m128i _mm_mask_abs_epi16 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_abs_epi16 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpabsw
CPUID Flags: AVX512VL + AVX512BW
Description
Compute the absolute value of packed 16-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := ABS(a[i+15:i])
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:128] := 0
vpabsw
__m128i _mm_maskz_abs_epi16 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_abs_epi16 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpabsw
CPUID Flags: AVX512VL + AVX512BW
Description
Compute the absolute value of packed 16-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := ABS(a[i+15:i])
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpabsw
__m256i _mm256_abs_epi16 (__m256i a)
Synopsis
__m256i _mm256_abs_epi16 (__m256i a)
#include "immintrin.h"
Instruction: vpabsw ymm, ymm
CPUID Flags: AVX2
Description
Compute the absolute value of packed 16-bit integers in a, and store the unsigned results in dst.
Operation
FOR j := 0 to 15
i := j*16
dst[i+15:i] := ABS(a[i+15:i])
ENDFOR
dst[MAX:256] := 0
Performance
vpabsw
__m256i _mm256_mask_abs_epi16 (__m256i src, __mmask16 k, __m256i a)
Synopsis
__m256i _mm256_mask_abs_epi16 (__m256i src, __mmask16 k, __m256i a)
#include "immintrin.h"
Instruction: vpabsw
CPUID Flags: AVX512VL + AVX512BW
Description
Compute the absolute value of packed 16-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := ABS(a[i+15:i])
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
vpabsw
__m256i _mm256_maskz_abs_epi16 (__mmask16 k, __m256i a)
Synopsis
__m256i _mm256_maskz_abs_epi16 (__mmask16 k, __m256i a)
#include "immintrin.h"
Instruction: vpabsw
CPUID Flags: AVX512VL + AVX512BW
Description
Compute the absolute value of packed 16-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := ABS(a[i+15:i])
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpabsw
__m512i _mm512_abs_epi16 (__m512i a)
Synopsis
__m512i _mm512_abs_epi16 (__m512i a)
#include "immintrin.h"
Instruction: vpabsw
CPUID Flags: AVX512BW
Description
Compute the absolute value of packed 16-bit integers in a, and store the unsigned results in dst.
Operation
FOR j := 0 to 31
i := j*16
dst[i+15:i] := ABS(a[i+15:i])
ENDFOR
dst[MAX:512] := 0
vpabsw
__m512i _mm512_mask_abs_epi16 (__m512i src, __mmask32 k, __m512i a)
Synopsis
__m512i _mm512_mask_abs_epi16 (__m512i src, __mmask32 k, __m512i a)
#include "immintrin.h"
Instruction: vpabsw
CPUID Flags: AVX512BW
Description
Compute the absolute value of packed 16-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := ABS(a[i+15:i])
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:512] := 0
vpabsw
__m512i _mm512_maskz_abs_epi16 (__mmask32 k, __m512i a)
Synopsis
__m512i _mm512_maskz_abs_epi16 (__mmask32 k, __m512i a)
#include "immintrin.h"
Instruction: vpabsw
CPUID Flags: AVX512BW
Description
Compute the absolute value of packed 16-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := ABS(a[i+15:i])
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
pabsd
__m128i _mm_abs_epi32 (__m128i a)
Synopsis
__m128i _mm_abs_epi32 (__m128i a)
#include "tmmintrin.h"
Instruction: pabsd xmm, xmm
CPUID Flags: SSSE3
Description
Compute the absolute value of packed 32-bit integers in a, and store the unsigned results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := ABS(a[i+31:i])
ENDFOR
Performance
vpabsd
__m128i _mm_mask_abs_epi32 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_abs_epi32 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpabsd
CPUID Flags: AVX512VL + AVX512F
Description
Compute the absolute value of packed 32-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := ABS(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vpabsd
__m128i _mm_maskz_abs_epi32 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_abs_epi32 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpabsd
CPUID Flags: AVX512VL + AVX512F
Description
Compute the absolute value of packed 32-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := ABS(a[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpabsd
__m256i _mm256_abs_epi32 (__m256i a)
Synopsis
__m256i _mm256_abs_epi32 (__m256i a)
#include "immintrin.h"
Instruction: vpabsd ymm, ymm
CPUID Flags: AVX2
Description
Compute the absolute value of packed 32-bit integers in a, and store the unsigned results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := ABS(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
Performance
vpabsd
__m256i _mm256_mask_abs_epi32 (__m256i src, __mmask8 k, __m256i a)
Synopsis
__m256i _mm256_mask_abs_epi32 (__m256i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpabsd
CPUID Flags: AVX512VL + AVX512F
Description
Compute the absolute value of packed 32-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := ABS(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vpabsd
__m256i _mm256_maskz_abs_epi32 (__mmask8 k, __m256i a)
Synopsis
__m256i _mm256_maskz_abs_epi32 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpabsd
CPUID Flags: AVX512VL + AVX512F
Description
Compute the absolute value of packed 32-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := ABS(a[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpabsd
__m512i _mm512_abs_epi32 (__m512i a)
Synopsis
__m512i _mm512_abs_epi32 (__m512i a)
#include "immintrin.h"
Instruction: vpabsd zmm {k}, zmm
CPUID Flags: AVX512F
Description
Compute the absolute value of packed 32-bit integers in a, and store the unsigned results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := ABS(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
vpabsd
__m512i _mm512_mask_abs_epi32 (__m512i src, __mmask16 k, __m512i a)
Synopsis
__m512i _mm512_mask_abs_epi32 (__m512i src, __mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpabsd zmm {k}, zmm
CPUID Flags: AVX512F
Description
Compute the absolute value of packed 32-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := ABS(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpabsd
__m512i _mm512_maskz_abs_epi32 (__mmask16 k, __m512i a)
Synopsis
__m512i _mm512_maskz_abs_epi32 (__mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpabsd zmm {k}, zmm
CPUID Flags: AVX512F
Description
Compute the absolute value of packed 32-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := ABS(a[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpabsq
__m128i _mm_abs_epi64 (__m128i a)
Synopsis
__m128i _mm_abs_epi64 (__m128i a)
#include "immintrin.h"
Instruction: vpabsq
CPUID Flags: AVX512VL + AVX512F
Description
Compute the absolute value of packed 64-bit integers in a, and store the unsigned results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := ABS(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
vpabsq
__m128i _mm_mask_abs_epi64 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_abs_epi64 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpabsq
CPUID Flags: AVX512VL + AVX512F
Description
Compute the absolute value of packed 64-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := ABS(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpabsq
__m128i _mm_maskz_abs_epi64 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_abs_epi64 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpabsq
CPUID Flags: AVX512VL + AVX512F
Description
Compute the absolute value of packed 64-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := ABS(a[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpabsq
__m256i _mm256_abs_epi64 (__m256i a)
Synopsis
__m256i _mm256_abs_epi64 (__m256i a)
#include "immintrin.h"
Instruction: vpabsq
CPUID Flags: AVX512VL + AVX512F
Description
Compute the absolute value of packed 64-bit integers in a, and store the unsigned results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := ABS(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
vpabsq
__m256i _mm256_mask_abs_epi64 (__m256i src, __mmask8 k, __m256i a)
Synopsis
__m256i _mm256_mask_abs_epi64 (__m256i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpabsq
CPUID Flags: AVX512VL + AVX512F
Description
Compute the absolute value of packed 64-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := ABS(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpabsq
__m256i _mm256_maskz_abs_epi64 (__mmask8 k, __m256i a)
Synopsis
__m256i _mm256_maskz_abs_epi64 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpabsq
CPUID Flags: AVX512VL + AVX512F
Description
Compute the absolute value of packed 64-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := ABS(a[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpabsq
__m512i _mm512_abs_epi64 (__m512i a)
Synopsis
__m512i _mm512_abs_epi64 (__m512i a)
#include "immintrin.h"
Instruction: vpabsq zmm {k}, zmm
CPUID Flags: AVX512F
Description
Compute the absolute value of packed 64-bit integers in a, and store the unsigned results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := ABS(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
vpabsq
__m512i _mm512_mask_abs_epi64 (__m512i src, __mmask8 k, __m512i a)
Synopsis
__m512i _mm512_mask_abs_epi64 (__m512i src, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpabsq zmm {k}, zmm
CPUID Flags: AVX512F
Description
Compute the absolute value of packed 64-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := ABS(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpabsq
__m512i _mm512_maskz_abs_epi64 (__mmask8 k, __m512i a)
Synopsis
__m512i _mm512_maskz_abs_epi64 (__mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpabsq zmm {k}, zmm
CPUID Flags: AVX512F
Description
Compute the absolute value of packed 64-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := ABS(a[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
pabsb
__m128i _mm_abs_epi8 (__m128i a)
Synopsis
__m128i _mm_abs_epi8 (__m128i a)
#include "tmmintrin.h"
Instruction: pabsb xmm, xmm
CPUID Flags: SSSE3
Description
Compute the absolute value of packed 8-bit integers in a, and store the unsigned results in dst.
Operation
FOR j := 0 to 15
i := j*8
dst[i+7:i] := ABS(a[i+7:i])
ENDFOR
Performance
vpabsb
__m128i _mm_mask_abs_epi8 (__m128i src, __mmask16 k, __m128i a)
Synopsis
__m128i _mm_mask_abs_epi8 (__m128i src, __mmask16 k, __m128i a)
#include "immintrin.h"
Instruction: vpabsb
CPUID Flags: AVX512VL + AVX512BW
Description
Compute the absolute value of packed 8-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k[j]
dst[i+7:i] := ABS(a[i+7:i])
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:128] := 0
vpabsb
__m128i _mm_maskz_abs_epi8 (__mmask16 k, __m128i a)
Synopsis
__m128i _mm_maskz_abs_epi8 (__mmask16 k, __m128i a)
#include "immintrin.h"
Instruction: vpabsb
CPUID Flags: AVX512VL + AVX512BW
Description
Compute the absolute value of packed 8-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k[j]
dst[i+7:i] := ABS(a[i+7:i])
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpabsb
__m256i _mm256_abs_epi8 (__m256i a)
Synopsis
__m256i _mm256_abs_epi8 (__m256i a)
#include "immintrin.h"
Instruction: vpabsb ymm, ymm
CPUID Flags: AVX2
Description
Compute the absolute value of packed 8-bit integers in a, and store the unsigned results in dst.
Operation
FOR j := 0 to 31
i := j*8
dst[i+7:i] := ABS(a[i+7:i])
ENDFOR
dst[MAX:256] := 0
Performance
vpabsb
__m256i _mm256_mask_abs_epi8 (__m256i src, __mmask32 k, __m256i a)
Synopsis
__m256i _mm256_mask_abs_epi8 (__m256i src, __mmask32 k, __m256i a)
#include "immintrin.h"
Instruction: vpabsb
CPUID Flags: AVX512VL + AVX512BW
Description
Compute the absolute value of packed 8-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k[j]
dst[i+7:i] := ABS(a[i+7:i])
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:256] := 0
vpabsb
__m256i _mm256_maskz_abs_epi8 (__mmask32 k, __m256i a)
Synopsis
__m256i _mm256_maskz_abs_epi8 (__mmask32 k, __m256i a)
#include "immintrin.h"
Instruction: vpabsb
CPUID Flags: AVX512VL + AVX512BW
Description
Compute the absolute value of packed 8-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k[j]
dst[i+7:i] := ABS(a[i+7:i])
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpabsb
__m512i _mm512_abs_epi8 (__m512i a)
Synopsis
__m512i _mm512_abs_epi8 (__m512i a)
#include "immintrin.h"
Instruction: vpabsb
CPUID Flags: AVX512BW
Description
Compute the absolute value of packed 8-bit integers in a, and store the unsigned results in dst.
Operation
FOR j := 0 to 63
i := j*8
dst[i+7:i] := ABS(a[i+7:i])
ENDFOR
dst[MAX:512] := 0
vpabsb
__m512i _mm512_mask_abs_epi8 (__m512i src, __mmask64 k, __m512i a)
Synopsis
__m512i _mm512_mask_abs_epi8 (__m512i src, __mmask64 k, __m512i a)
#include "immintrin.h"
Instruction: vpabsb
CPUID Flags: AVX512BW
Description
Compute the absolute value of packed 8-bit integers in a, and store the unsigned results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k[j]
dst[i+7:i] := ABS(a[i+7:i])
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:512] := 0
vpabsb
__m512i _mm512_maskz_abs_epi8 (__mmask64 k, __m512i a)
Synopsis
__m512i _mm512_maskz_abs_epi8 (__mmask64 k, __m512i a)
#include "immintrin.h"
Instruction: vpabsb
CPUID Flags: AVX512BW
Description
Compute the absolute value of packed 8-bit integers in a, and store the unsigned results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k[j]
dst[i+7:i] := ABS(a[i+7:i])
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpandq
__m512d _mm512_abs_pd (__m512d v2)
Synopsis
__m512d _mm512_abs_pd (__m512d v2)
#include "immintrin.h"
Instruction: vpandq zmm {k}, zmm, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Finds the absolute value of each packed double-precision (64-bit) floating-point element in v2, storing the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := ABS(v2[i+63:i])
ENDFOR
dst[MAX:512] := 0
vpandq
__m512d _mm512_mask_abs_pd (__m512d src, __mmask8 k, __m512d v2)
Synopsis
__m512d _mm512_mask_abs_pd (__m512d src, __mmask8 k, __m512d v2)
#include "immintrin.h"
Instruction: vpandq zmm {k}, zmm, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Finds the absolute value of each packed double-precision (64-bit) floating-point element in v2, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := ABS(v2[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
pabsw
__m64 _mm_abs_pi16 (__m64 a)
Synopsis
__m64 _mm_abs_pi16 (__m64 a)
#include "tmmintrin.h"
Instruction: pabsw mm, mm
CPUID Flags: SSSE3
Description
Compute the absolute value of packed 16-bit integers in a, and store the unsigned results in dst.
Operation
FOR j := 0 to 3
i := j*16
dst[i+15:i] := ABS(a[i+15:i])
ENDFOR
Performance
pabsd
__m64 _mm_abs_pi32 (__m64 a)
Synopsis
__m64 _mm_abs_pi32 (__m64 a)
#include "tmmintrin.h"
Instruction: pabsd mm, mm
CPUID Flags: SSSE3
Description
Compute the absolute value of packed 32-bit integers in a, and store the unsigned results in dst.
Operation
FOR j := 0 to 1
i := j*32
dst[i+31:i] := ABS(a[i+31:i])
ENDFOR
Performance
pabsb
__m64 _mm_abs_pi8 (__m64 a)
Synopsis
__m64 _mm_abs_pi8 (__m64 a)
#include "tmmintrin.h"
Instruction: pabsb mm, mm
CPUID Flags: SSSE3
Description
Compute the absolute value of packed 8-bit integers in a, and store the unsigned results in dst.
Operation
FOR j := 0 to 7
i := j*8
dst[i+7:i] := ABS(a[i+7:i])
ENDFOR
Performance
vpandd
__m512 _mm512_abs_ps (__m512 v2)
Synopsis
__m512 _mm512_abs_ps (__m512 v2)
#include "immintrin.h"
Instruction: vpandd zmm {k}, zmm, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Finds the absolute value of each packed single-precision (32-bit) floating-point element in v2, storing the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := ABS(v2[i+31:i])
ENDFOR
dst[MAX:512] := 0
vpandd
__m512 _mm512_mask_abs_ps (__m512 src, __mmask16 k, __m512 v2)
Synopsis
__m512 _mm512_mask_abs_ps (__m512 src, __mmask16 k, __m512 v2)
#include "immintrin.h"
Instruction: vpandd zmm {k}, zmm, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Finds the absolute value of each packed single-precision (32-bit) floating-point element in v2, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := ABS(v2[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128d _mm_acos_pd (__m128d a)
Synopsis
__m128d _mm_acos_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in a, and store the results (expressed in radians) in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := ACOS(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
...
__m256d _mm256_acos_pd (__m256d a)
Synopsis
__m256d _mm256_acos_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in a, and store the results (expressed in radians) in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := ACOS(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
...
__m512d _mm512_acos_pd (__m512d a)
Synopsis
__m512d _mm512_acos_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in a, and store the results (expressed in radians) in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := ACOS(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
...
__m512d _mm512_mask_acos_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_acos_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in a, and store the results (expressed in radians) in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := ACOS(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128 _mm_acos_ps (__m128 a)
Synopsis
__m128 _mm_acos_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in a, and store the results (expressed in radians) in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := ACOS(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256 _mm256_acos_ps (__m256 a)
Synopsis
__m256 _mm256_acos_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in a, and store the results (expressed in radians) in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := ACOS(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
...
__m512 _mm512_acos_ps (__m512 a)
Synopsis
__m512 _mm512_acos_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in a, and store the results (expressed in radians) in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := ACOS(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
...
__m512 _mm512_mask_acos_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_acos_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in a, and store the results (expressed in radians) in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := ACOS(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128d _mm_acosh_pd (__m128d a)
Synopsis
__m128d _mm_acosh_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := ACOSH(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
...
__m256d _mm256_acosh_pd (__m256d a)
Synopsis
__m256d _mm256_acosh_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := ACOSH(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
...
__m512d _mm512_acosh_pd (__m512d a)
Synopsis
__m512d _mm512_acosh_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := ACOSH(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
...
__m512d _mm512_mask_acosh_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_acosh_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := ACOSH(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128 _mm_acosh_ps (__m128 a)
Synopsis
__m128 _mm_acosh_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := ACOSH(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256 _mm256_acosh_ps (__m256 a)
Synopsis
__m256 _mm256_acosh_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := ACOSH(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
...
__m512 _mm512_acosh_ps (__m512 a)
Synopsis
__m512 _mm512_acosh_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := ACOSH(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
...
__m512 _mm512_mask_acosh_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_acosh_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := ACOSH(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpadcd
__m512i _mm512_adc_epi32 (__m512i v2, __mmask16 k2, __m512i v3, __mmask16 * k2_res)
Synopsis
__m512i _mm512_adc_epi32 (__m512i v2, __mmask16 k2, __m512i v3, __mmask16 * k2_res)
#include "immintrin.h"
Instruction: vpadcd zmm {k}, k, zmm
CPUID Flags: KNCNI
Description
Performs element-by-element addition of packed 32-bit integers in v2 and v3 and the corresponding bit in k2, storing the result of the addition in dst and the result of the carry in k2_res.
Operation
FOR j := 0 to 15
i := j*32
k2_res[j] := Carry(v2[i+31:i] + v3[i+31:i] + k2[j])
dst[i+31:i] := v2[i+31:i] + v3[i+31:i] + k2[j]
ENDFOR
dst[MAX:512] := 0
vpadcd
__m512i _mm512_mask_adc_epi32 (__m512i v2, __mmask16 k1, __mmask16 k2, __m512i v3, __mmask16 * k2_res)
Synopsis
__m512i _mm512_mask_adc_epi32 (__m512i v2, __mmask16 k1, __mmask16 k2, __m512i v3, __mmask16 * k2_res)
#include "immintrin.h"
Instruction: vpadcd zmm {k}, k, zmm
CPUID Flags: KNCNI
Description
Performs element-by-element addition of packed 32-bit integers in v2 and v3 and the corresponding bit in k2, storing the result of the addition in dst and the result of the carry in k2_res using writemask k1 (elements are copied from v2 when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k1[j]
k2_res[j] := Carry(v2[i+31:i] + v3[i+31:i] + k2[j])
dst[i+31:i] := v2[i+31:i] + v3[i+31:i] + k2[j]
ELSE
dst[i+31:i] := v2[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
paddw
__m128i _mm_add_epi16 (__m128i a, __m128i b)
Synopsis
__m128i _mm_add_epi16 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: paddw xmm, xmm
CPUID Flags: SSE2
Description
Add packed 16-bit integers in a and b, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*16
dst[i+15:i] := a[i+15:i] + b[i+15:i]
ENDFOR
Performance
vpaddw
__m128i _mm_mask_add_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_add_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpaddw
CPUID Flags: AVX512VL + AVX512BW
Description
Add packed 16-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := a[i+15:i] + b[i+15:i]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:128] := 0
vpaddw
__m128i _mm_maskz_add_epi16 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_add_epi16 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpaddw
CPUID Flags: AVX512VL + AVX512BW
Description
Add packed 16-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := a[i+15:i] + b[i+15:i]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpaddw
__m256i _mm256_add_epi16 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_add_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddw ymm, ymm, ymm
CPUID Flags: AVX2
Description
Add packed 16-bit integers in a and b, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*16
dst[i+15:i] := a[i+15:i] + b[i+15:i]
ENDFOR
dst[MAX:256] := 0
Performance
vpaddw
__m256i _mm256_mask_add_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_add_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddw
CPUID Flags: AVX512VL + AVX512BW
Description
Add packed 16-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := a[i+15:i] + b[i+15:i]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
vpaddw
__m256i _mm256_maskz_add_epi16 (__mmask16 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_add_epi16 (__mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddw
CPUID Flags: AVX512VL + AVX512BW
Description
Add packed 16-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := a[i+15:i] + b[i+15:i]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpaddw
__m512i _mm512_add_epi16 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_add_epi16 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpaddw
CPUID Flags: AVX512BW
Description
Add packed 16-bit integers in a and b, and store the results in dst.
Operation
FOR j := 0 to 31
i := j*16
dst[i+15:i] := a[i+15:i] + b[i+15:i]
ENDFOR
dst[MAX:512] := 0
vpaddw
__m512i _mm512_mask_add_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_add_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpaddw
CPUID Flags: AVX512BW
Description
Add packed 16-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := a[i+15:i] + b[i+15:i]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:512] := 0
vpaddw
__m512i _mm512_maskz_add_epi16 (__mmask32 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_add_epi16 (__mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpaddw
CPUID Flags: AVX512BW
Description
Add packed 16-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := a[i+15:i] + b[i+15:i]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
paddd
__m128i _mm_add_epi32 (__m128i a, __m128i b)
Synopsis
__m128i _mm_add_epi32 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: paddd xmm, xmm
CPUID Flags: SSE2
Description
Add packed 32-bit integers in a and b, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := a[i+31:i] + b[i+31:i]
ENDFOR
Performance
vpaddd
__m128i _mm_mask_add_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_add_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpaddd
CPUID Flags: AVX512VL + AVX512F
Description
Add packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] + b[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vpaddd
__m128i _mm_maskz_add_epi32 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_add_epi32 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpaddd
CPUID Flags: AVX512VL + AVX512F
Description
Add packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] + b[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpaddd
__m256i _mm256_add_epi32 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_add_epi32 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddd ymm, ymm, ymm
CPUID Flags: AVX2
Description
Add packed 32-bit integers in a and b, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := a[i+31:i] + b[i+31:i]
ENDFOR
dst[MAX:256] := 0
Performance
vpaddd
__m256i _mm256_mask_add_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_add_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddd
CPUID Flags: AVX512VL + AVX512F
Description
Add packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] + b[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vpaddd
__m256i _mm256_maskz_add_epi32 (__mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_add_epi32 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddd
CPUID Flags: AVX512VL + AVX512F
Description
Add packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] + b[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpaddd
__m512i _mm512_add_epi32 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_add_epi32 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpaddd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Add packed 32-bit integers in a and b, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := a[i+31:i] + b[i+31:i]
ENDFOR
dst[MAX:512] := 0
vpaddd
__m512i _mm512_mask_add_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_add_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpaddd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Add packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] + b[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpaddd
__m512i _mm512_maskz_add_epi32 (__mmask16 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_add_epi32 (__mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpaddd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Add packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] + b[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
paddq
__m128i _mm_add_epi64 (__m128i a, __m128i b)
Synopsis
__m128i _mm_add_epi64 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: paddq xmm, xmm
CPUID Flags: SSE2
Description
Add packed 64-bit integers in a and b, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := a[i+63:i] + b[i+63:i]
ENDFOR
Performance
vpaddq
__m128i _mm_mask_add_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_add_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpaddq
CPUID Flags: AVX512VL + AVX512F
Description
Add packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] + b[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpaddq
__m128i _mm_maskz_add_epi64 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_add_epi64 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpaddq
CPUID Flags: AVX512VL + AVX512F
Description
Add packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] + b[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpaddq
__m256i _mm256_add_epi64 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_add_epi64 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddq ymm, ymm, ymm
CPUID Flags: AVX2
Description
Add packed 64-bit integers in a and b, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := a[i+63:i] + b[i+63:i]
ENDFOR
dst[MAX:256] := 0
Performance
vpaddq
__m256i _mm256_mask_add_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_add_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddq
CPUID Flags: AVX512VL + AVX512F
Description
Add packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] + b[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpaddq
__m256i _mm256_maskz_add_epi64 (__mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_add_epi64 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddq
CPUID Flags: AVX512VL + AVX512F
Description
Add packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] + b[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpaddq
__m512i _mm512_add_epi64 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_add_epi64 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpaddq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Add packed 64-bit integers in a and b, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := a[i+63:i] + b[i+63:i]
ENDFOR
dst[MAX:512] := 0
vpaddq
__m512i _mm512_mask_add_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_add_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpaddq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Add packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] + b[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpaddq
__m512i _mm512_maskz_add_epi64 (__mmask8 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_add_epi64 (__mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpaddq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Add packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] + b[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
paddb
__m128i _mm_add_epi8 (__m128i a, __m128i b)
Synopsis
__m128i _mm_add_epi8 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: paddb xmm, xmm
CPUID Flags: SSE2
Description
Add packed 8-bit integers in a and b, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*8
dst[i+7:i] := a[i+7:i] + b[i+7:i]
ENDFOR
Performance
vpaddb
__m128i _mm_mask_add_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_add_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpaddb
CPUID Flags: AVX512VL + AVX512BW
Description
Add packed 8-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k[j]
dst[i+7:i] := a[i+7:i] + b[i+7:i]
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:128] := 0
vpaddb
__m128i _mm_maskz_add_epi8 (__mmask16 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_add_epi8 (__mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpaddb
CPUID Flags: AVX512VL + AVX512BW
Description
Add packed 8-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k[j]
dst[i+7:i] := a[i+7:i] + b[i+7:i]
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpaddb
__m256i _mm256_add_epi8 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_add_epi8 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddb ymm, ymm, ymm
CPUID Flags: AVX2
Description
Add packed 8-bit integers in a and b, and store the results in dst.
Operation
FOR j := 0 to 31
i := j*8
dst[i+7:i] := a[i+7:i] + b[i+7:i]
ENDFOR
dst[MAX:256] := 0
Performance
vpaddb
__m256i _mm256_mask_add_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_add_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddb
CPUID Flags: AVX512VL + AVX512BW
Description
Add packed 8-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k[j]
dst[i+7:i] := a[i+7:i] + b[i+7:i]
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:256] := 0
vpaddb
__m256i _mm256_maskz_add_epi8 (__mmask32 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_add_epi8 (__mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddb
CPUID Flags: AVX512VL + AVX512BW
Description
Add packed 8-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k[j]
dst[i+7:i] := a[i+7:i] + b[i+7:i]
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpaddb
__m512i _mm512_add_epi8 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_add_epi8 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpaddb
CPUID Flags: AVX512BW
Description
Add packed 8-bit integers in a and b, and store the results in dst.
Operation
FOR j := 0 to 63
i := j*8
dst[i+7:i] := a[i+7:i] + b[i+7:i]
ENDFOR
dst[MAX:512] := 0
vpaddb
__m512i _mm512_mask_add_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_add_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpaddb
CPUID Flags: AVX512BW
Description
Add packed 8-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k[j]
dst[i+7:i] := a[i+7:i] + b[i+7:i]
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:512] := 0
vpaddb
__m512i _mm512_maskz_add_epi8 (__mmask64 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_add_epi8 (__mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpaddb
CPUID Flags: AVX512BW
Description
Add packed 8-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k[j]
dst[i+7:i] := a[i+7:i] + b[i+7:i]
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
addpd
__m128d _mm_add_pd (__m128d a, __m128d b)
Synopsis
__m128d _mm_add_pd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: addpd xmm, xmm
CPUID Flags: SSE2
Description
Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := a[i+63:i] + b[i+63:i]
ENDFOR
Performance
vaddpd
__m128d _mm_mask_add_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_mask_add_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vaddpd
CPUID Flags: AVX512F + AVX512VL
Description
Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] + b[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vaddpd
__m128d _mm_maskz_add_pd (__mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_maskz_add_pd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vaddpd
CPUID Flags: AVX512F + AVX512VL
Description
Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] + b[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vaddpd
__m256d _mm256_add_pd (__m256d a, __m256d b)
Synopsis
__m256d _mm256_add_pd (__m256d a, __m256d b)
#include "immintrin.h"
Instruction: vaddpd ymm, ymm, ymm
CPUID Flags: AVX
Description
Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := a[i+63:i] + b[i+63:i]
ENDFOR
dst[MAX:256] := 0
Performance
vaddpd
__m256d _mm256_mask_add_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)
Synopsis
__m256d _mm256_mask_add_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vaddpd
CPUID Flags: AVX512F + AVX512VL
Description
Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] + b[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vaddpd
__m256d _mm256_maskz_add_pd (__mmask8 k, __m256d a, __m256d b)
Synopsis
__m256d _mm256_maskz_add_pd (__mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vaddpd
CPUID Flags: AVX512F + AVX512VL
Description
Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] + b[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vaddpd
__m512d _mm512_add_pd (__m512d a, __m512d b)
Synopsis
__m512d _mm512_add_pd (__m512d a, __m512d b)
#include "immintrin.h"
Instruction: vaddpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := a[i+63:i] + b[i+63:i]
ENDFOR
dst[MAX:512] := 0
vaddpd
__m512d _mm512_mask_add_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
Synopsis
__m512d _mm512_mask_add_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vaddpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] + b[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vaddpd
__m512d _mm512_maskz_add_pd (__mmask8 k, __m512d a, __m512d b)
Synopsis
__m512d _mm512_maskz_add_pd (__mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vaddpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Add packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] + b[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
addps
__m128 _mm_add_ps (__m128 a, __m128 b)
Synopsis
__m128 _mm_add_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: addps xmm, xmm
CPUID Flags: SSE
Description
Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := a[i+31:i] + b[i+31:i]
ENDFOR
Performance
vaddps
__m128 _mm_mask_add_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_mask_add_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vaddps
CPUID Flags: AVX512F + AVX512VL
Description
Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] + b[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vaddps
__m128 _mm_maskz_add_ps (__mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_maskz_add_ps (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vaddps
CPUID Flags: AVX512F + AVX512VL
Description
Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] + b[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vaddps
__m256 _mm256_add_ps (__m256 a, __m256 b)
Synopsis
__m256 _mm256_add_ps (__m256 a, __m256 b)
#include "immintrin.h"
Instruction: vaddps ymm, ymm, ymm
CPUID Flags: AVX
Description
Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := a[i+31:i] + b[i+31:i]
ENDFOR
dst[MAX:256] := 0
Performance
vaddps
__m256 _mm256_mask_add_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)
Synopsis
__m256 _mm256_mask_add_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vaddps
CPUID Flags: AVX512F + AVX512VL
Description
Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] + b[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vaddps
__m256 _mm256_maskz_add_ps (__mmask8 k, __m256 a, __m256 b)
Synopsis
__m256 _mm256_maskz_add_ps (__mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vaddps
CPUID Flags: AVX512F + AVX512VL
Description
Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] + b[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vaddps
__m512 _mm512_add_ps (__m512 a, __m512 b)
Synopsis
__m512 _mm512_add_ps (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vaddps zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := a[i+31:i] + b[i+31:i]
ENDFOR
dst[MAX:512] := 0
vaddps
__m512 _mm512_mask_add_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
Synopsis
__m512 _mm512_mask_add_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vaddps zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] + b[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vaddps
__m512 _mm512_maskz_add_ps (__mmask16 k, __m512 a, __m512 b)
Synopsis
__m512 _mm512_maskz_add_ps (__mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vaddps zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] + b[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vaddpd
__m512d _mm512_add_round_pd (__m512d a, __m512d b, int rounding)
Synopsis
__m512d _mm512_add_round_pd (__m512d a, __m512d b, int rounding)
#include "immintrin.h"
Instruction: vaddpd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Add packed double-precision (64-bit) floating-point elements in
a and
b, and store the results in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := a[i+63:i] + b[i+63:i]
ENDFOR
dst[MAX:512] := 0
vaddpd
__m512d _mm512_mask_add_round_pd (__m512d src, __mmask8 k, __m512d a, __m512d b, int rounding)
Synopsis
__m512d _mm512_mask_add_round_pd (__m512d src, __mmask8 k, __m512d a, __m512d b, int rounding)
#include "immintrin.h"
Instruction: vaddpd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Add packed double-precision (64-bit) floating-point elements in
a and
b, and store the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] + b[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vaddpd
__m512d _mm512_maskz_add_round_pd (__mmask8 k, __m512d a, __m512d b, int rounding)
Synopsis
__m512d _mm512_maskz_add_round_pd (__mmask8 k, __m512d a, __m512d b, int rounding)
#include "immintrin.h"
Instruction: vaddpd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F
Description
Add packed double-precision (64-bit) floating-point elements in
a and
b, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] + b[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vaddps
__m512 _mm512_add_round_ps (__m512 a, __m512 b, int rounding)
Synopsis
__m512 _mm512_add_round_ps (__m512 a, __m512 b, int rounding)
#include "immintrin.h"
Instruction: vaddps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Add packed single-precision (32-bit) floating-point elements in
a and
b, and store the results in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := a[i+31:i] + b[i+31:i]
ENDFOR
dst[MAX:512] := 0
vaddps
__m512 _mm512_mask_add_round_ps (__m512 src, __mmask16 k, __m512 a, __m512 b, int rounding)
Synopsis
__m512 _mm512_mask_add_round_ps (__m512 src, __mmask16 k, __m512 a, __m512 b, int rounding)
#include "immintrin.h"
Instruction: vaddps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Add packed single-precision (32-bit) floating-point elements in
a and
b, and store the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] + b[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vaddps
__m512 _mm512_maskz_add_round_ps (__mmask16 k, __m512 a, __m512 b, int rounding)
Synopsis
__m512 _mm512_maskz_add_round_ps (__mmask16 k, __m512 a, __m512 b, int rounding)
#include "immintrin.h"
Instruction: vaddps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F
Description
Add packed single-precision (32-bit) floating-point elements in
a and
b, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] + b[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vaddsd
__m128d _mm_add_round_sd (__m128d a, __m128d b, int rounding)
Synopsis
__m128d _mm_add_round_sd (__m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vaddsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Add the lower double-precision (64-bit) floating-point element in
a and
b, store the result in the lower element of
dst, and copy the upper element from
a to the upper element of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[63:0] := a[63:0] + b[63:0]
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vaddsd
__m128d _mm_mask_add_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int rounding)
Synopsis
__m128d _mm_mask_add_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vaddsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Add the lower double-precision (64-bit) floating-point element in
a and
b, store the result in the lower element of
dst using writemask
k (the element is copied from
src when mask bit 0 is not set), and copy the upper element from
a to the upper element of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[63:0] := a[63:0] + b[63:0]
ELSE
dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vaddsd
__m128d _mm_maskz_add_round_sd (__mmask8 k, __m128d a, __m128d b, int rounding)
Synopsis
__m128d _mm_maskz_add_round_sd (__mmask8 k, __m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vaddsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Add the lower double-precision (64-bit) floating-point element in
a and
b, store the result in the lower element of
dst using zeromask
k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from
a to the upper element of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[63:0] := a[63:0] + b[63:0]
ELSE
dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vaddss
__m128 _mm_add_round_ss (__m128 a, __m128 b, int rounding)
Synopsis
__m128 _mm_add_round_ss (__m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vaddss xmm, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Add the lower single-precision (32-bit) floating-point element in
a and
b, store the result in the lower element of
dst, and copy the upper 3 packed elements from
a to the upper elements of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[31:0] := a[31:0] + b[31:0]
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vaddss
__m128 _mm_mask_add_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int rounding)
Synopsis
__m128 _mm_mask_add_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vaddss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Add the lower single-precision (32-bit) floating-point element in
a and
b, store the result in the lower element of
dst using writemask
k (the element is copied from
src when mask bit 0 is not set), and copy the upper 3 packed elements from
a to the upper elements of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[31:0] := a[31:0] + b[31:0]
ELSE
dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vaddss
__m128 _mm_maskz_add_round_ss (__mmask8 k, __m128 a, __m128 b, int rounding)
Synopsis
__m128 _mm_maskz_add_round_ss (__mmask8 k, __m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vaddss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Add the lower single-precision (32-bit) floating-point element in
a and
b, store the result in the lower element of
dst using zeromask
k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from
a to the upper elements of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[31:0] := a[31:0] + b[31:0]
ELSE
dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
addsd
__m128d _mm_add_sd (__m128d a, __m128d b)
Synopsis
__m128d _mm_add_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: addsd xmm, xmm
CPUID Flags: SSE2
Description
Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
Operation
dst[63:0] := a[63:0] + b[63:0]
dst[127:64] := a[127:64]
Performance
vaddsd
__m128d _mm_mask_add_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_mask_add_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vaddsd xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Operation
IF k[0]
dst[63:0] := a[63:0] + b[63:0]
ELSE
dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vaddsd
__m128d _mm_maskz_add_sd (__mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_maskz_add_sd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vaddsd xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Operation
IF k[0]
dst[63:0] := a[63:0] + b[63:0]
ELSE
dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
paddq
__m64 _mm_add_si64 (__m64 a, __m64 b)
Synopsis
__m64 _mm_add_si64 (__m64 a, __m64 b)
#include "emmintrin.h"
Instruction: paddq mm, mm
CPUID Flags: SSE2
Description
Add 64-bit integers a and b, and store the result in dst.
Operation
dst[63:0] := a[63:0] + b[63:0]
Performance
addss
__m128 _mm_add_ss (__m128 a, __m128 b)
Synopsis
__m128 _mm_add_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: addss xmm, xmm
CPUID Flags: SSE
Description
Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
dst[31:0] := a[31:0] + b[31:0]
dst[127:32] := a[127:32]
Performance
vaddss
__m128 _mm_mask_add_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_mask_add_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vaddss xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
IF k[0]
dst[31:0] := a[31:0] + b[31:0]
ELSE
dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vaddss
__m128 _mm_maskz_add_ss (__mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_maskz_add_ss (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vaddss xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
IF k[0]
dst[31:0] := a[31:0] + b[31:0]
ELSE
dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
adc
unsigned char _addcarry_u32 (unsigned char c_in, unsigned int a, unsigned int b, unsigned int * out)
Synopsis
unsigned char _addcarry_u32 (unsigned char c_in, unsigned int a, unsigned int b, unsigned int * out)
#include "immintrin.h"
Instruction: adc r32, r32
Description
Add unsigned 32-bit integers a and b with unsigned 8-bit carry-in c_in (carry flag), and store the unsigned 32-bit result in out, and the carry-out in dst (carry flag).
Operation
dst:out[31:0] := a[31:0] + b[31:0] + c_in;
adc
unsigned char _addcarry_u64 (unsigned char c_in, unsigned __int64 a, unsigned __int64 b, unsigned __int64 * out)
Synopsis
unsigned char _addcarry_u64 (unsigned char c_in, unsigned __int64 a, unsigned __int64 b, unsigned __int64 * out)
#include "immintrin.h"
Instruction: adc r64, r64
Description
Add unsigned 64-bit integers a and b with unsigned 8-bit carry-in c_in (carry flag), and store the unsigned 64-bit result in out, and the carry-out in dst (carry flag).
Operation
dst:out[63:0] := a[63:0] + b[63:0] + c_in;
adcx, adox
unsigned char _addcarryx_u32 (unsigned char c_in, unsigned int a, unsigned int b, unsigned int * out)
Synopsis
unsigned char _addcarryx_u32 (unsigned char c_in, unsigned int a, unsigned int b, unsigned int * out)
#include "immintrin.h"
Instruction: adcx r32, r32
adox r32, r32
CPUID Flags: ADX
Description
Add unsigned 32-bit integers a and b with unsigned 8-bit carry-in c_in (carry or overflow flag), and store the unsigned 32-bit result in out, and the carry-out in dst (carry or overflow flag).
Operation
dst:out[31:0] := a[31:0] + b[31:0] + c_in;
adcx, adox
unsigned char _addcarryx_u64 (unsigned char c_in, unsigned __int64 a, unsigned __int64 b, unsigned __int64 * out)
Synopsis
unsigned char _addcarryx_u64 (unsigned char c_in, unsigned __int64 a, unsigned __int64 b, unsigned __int64 * out)
#include "immintrin.h"
Instruction: adcx r64, r64
adox r64, r64
CPUID Flags: ADX
Description
Add unsigned 64-bit integers a and b with unsigned 8-bit carry-in c_in (carry or overflow flag), and store the unsigned 64-bit result in out, and the carry-out in dst (carry or overflow flag).
Operation
dst:out[63:0] := a[63:0] + b[63:0] + c_in;
vaddnpd
__m512d _mm512_addn_pd (__m512d v2, __m512d v3)
Synopsis
__m512d _mm512_addn_pd (__m512d v2, __m512d v3)
#include "immintrin.h"
Instruction: vaddnpd zmm {k}, zmm, zmm
CPUID Flags: KNCNI
Description
Performs element-by-element addition between packed double-precision (64-bit) floating-point elements in v2 and v3 and negates their sum, storing the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := -(v2[i+63:i] + v3[i+63:i])
ENDFOR
dst[MAX:512] := 0
vaddnpd
__m512d _mm512_mask_addn_pd (__m512d src, __mmask8 k, __m512d v2, __m512d v3)
Synopsis
__m512d _mm512_mask_addn_pd (__m512d src, __mmask8 k, __m512d v2, __m512d v3)
#include "immintrin.h"
Instruction: vaddnpd zmm {k}, zmm, zmm
CPUID Flags: KNCNI
Description
Performs element-by-element addition between packed double-precision (64-bit) floating-point elements in v2 and v3 and negates their sum, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := -(v2[i+63:i] + v3[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vaddnps
__m512 _mm512_addn_ps (__m512 v2, __m512 v3)
Synopsis
__m512 _mm512_addn_ps (__m512 v2, __m512 v3)
#include "immintrin.h"
Instruction: vaddnps zmm {k}, zmm, zmm
CPUID Flags: KNCNI
Description
Performs element-by-element addition between packed single-precision (32-bit) floating-point elements in v2 and v3 and negates their sum, storing the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := -(v2[i+31:i] + v3[i+31:i])
ENDFOR
dst[MAX:512] := 0
vaddnps
__m512 _mm512_mask_addn_ps (__m512 src, __mmask16 k, __m512 v2, __m512 v3)
Synopsis
__m512 _mm512_mask_addn_ps (__m512 src, __mmask16 k, __m512 v2, __m512 v3)
#include "immintrin.h"
Instruction: vaddnps zmm {k}, zmm, zmm
CPUID Flags: KNCNI
Description
Performs element-by-element addition between packed single-precision (32-bit) floating-point elements in v2 and v3 and negates their sum, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := -(v2[i+31:i] + v3[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vaddnpd
__m512d _mm512_addn_round_pd (__m512d v2, __m512d v3, int rounding)
Synopsis
__m512d _mm512_addn_round_pd (__m512d v2, __m512d v3, int rounding)
#include "immintrin.h"
Instruction: vaddnpd zmm {k}, zmm, zmm
CPUID Flags: KNCNI
Description
Performs element by element addition between packed double-precision (64-bit) floating-point elements in
v2 and
v3 and negates the sum, storing the result in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := -(v2[i+63:i] + v3[i+63:i])
ENDFOR
dst[MAX:512] := 0
vaddnpd
__m512d _mm512_mask_addn_round_pd (__m512d src, __mmask8 k, __m512d v2, __m512d v3, int rounding)
Synopsis
__m512d _mm512_mask_addn_round_pd (__m512d src, __mmask8 k, __m512d v2, __m512d v3, int rounding)
#include "immintrin.h"
Instruction: vaddnpd zmm {k}, zmm, zmm
CPUID Flags: KNCNI
Description
Performs element by element addition between packed double-precision (64-bit) floating-point elements in
v2 and
v3 and negates the sum, storing the result in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := -(v2[i+63:i] + v3[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vaddnps
__m512 _mm512_addn_round_ps (__m512 v2, __m512 v3, int rounding)
Synopsis
__m512 _mm512_addn_round_ps (__m512 v2, __m512 v3, int rounding)
#include "immintrin.h"
Instruction: vaddnps zmm {k}, zmm, zmm
CPUID Flags: KNCNI
Description
Performs element by element addition between packed single-precision (32-bit) floating-point elements in
v2 and
v3 and negates the sum, storing the result in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := -(v2[i+31:i] + v3[i+31:i])
ENDFOR
dst[MAX:512] := 0
vaddnps
__m512 _mm512_mask_addn_round_ps (__m512 src, __mmask16 k, __m512 v2, __m512 v3, int rounding)
Synopsis
__m512 _mm512_mask_addn_round_ps (__m512 src, __mmask16 k, __m512 v2, __m512 v3, int rounding)
#include "immintrin.h"
Instruction: vaddnps zmm {k}, zmm, zmm
CPUID Flags: KNCNI
Description
Performs element by element addition between packed single-precision (32-bit) floating-point elements in
v2 and
v3 and negates the sum, storing the result in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := -(v2[i+31:i] + v3[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
paddsw
__m128i _mm_adds_epi16 (__m128i a, __m128i b)
Synopsis
__m128i _mm_adds_epi16 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: paddsw xmm, xmm
CPUID Flags: SSE2
Description
Add packed 16-bit integers in a and b using saturation, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*16
dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] )
ENDFOR
Performance
vpaddsw
__m128i _mm_mask_adds_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_adds_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpaddsw
CPUID Flags: AVX512VL + AVX512BW
Description
Add packed 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] )
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:128] := 0
vpaddsw
__m128i _mm_maskz_adds_epi16 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_adds_epi16 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpaddsw
CPUID Flags: AVX512VL + AVX512BW
Description
Add packed 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] )
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpaddsw
__m256i _mm256_adds_epi16 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_adds_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddsw ymm, ymm, ymm
CPUID Flags: AVX2
Description
Add packed 16-bit integers in a and b using saturation, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*16
dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] )
ENDFOR
dst[MAX:256] := 0
Performance
vpaddsw
__m256i _mm256_mask_adds_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_adds_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddsw
CPUID Flags: AVX512VL + AVX512BW
Description
Add packed 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] )
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
vpaddsw
__m256i _mm256_maskz_adds_epi16 (__mmask16 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_adds_epi16 (__mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddsw
CPUID Flags: AVX512VL + AVX512BW
Description
Add packed 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] )
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpaddsw
__m512i _mm512_adds_epi16 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_adds_epi16 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpaddsw
CPUID Flags: AVX512BW
Description
Add packed 16-bit integers in a and b using saturation, and store the results in dst.
Operation
FOR j := 0 to 31
i := j*16
dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] )
ENDFOR
dst[MAX:512] := 0
vpaddsw
__m512i _mm512_mask_adds_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_adds_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpaddsw
CPUID Flags: AVX512BW
Description
Add packed 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] )
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:512] := 0
vpaddsw
__m512i _mm512_maskz_adds_epi16 (__mmask32 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_adds_epi16 (__mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpaddsw
CPUID Flags: AVX512BW
Description
Add packed 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] )
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
paddsb
__m128i _mm_adds_epi8 (__m128i a, __m128i b)
Synopsis
__m128i _mm_adds_epi8 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: paddsb xmm, xmm
CPUID Flags: SSE2
Description
Add packed 8-bit integers in a and b using saturation, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*8
dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] )
ENDFOR
Performance
vpaddsb
__m128i _mm_mask_adds_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_adds_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpaddsb
CPUID Flags: AVX512VL + AVX512BW
Description
Add packed 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k[j]
dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] )
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:128] := 0
vpaddsb
__m128i _mm_maskz_adds_epi8 (__mmask16 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_adds_epi8 (__mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpaddsb
CPUID Flags: AVX512VL + AVX512BW
Description
Add packed 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k[j]
dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] )
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpaddsb
__m256i _mm256_adds_epi8 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_adds_epi8 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddsb ymm, ymm, ymm
CPUID Flags: AVX2
Description
Add packed 8-bit integers in a and b using saturation, and store the results in dst.
Operation
FOR j := 0 to 31
i := j*8
dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] )
ENDFOR
dst[MAX:256] := 0
Performance
vpaddsb
__m256i _mm256_mask_adds_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_adds_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddsb
CPUID Flags: AVX512VL + AVX512BW
Description
Add packed 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k[j]
dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] )
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:256] := 0
vpaddsb
__m256i _mm256_maskz_adds_epi8 (__mmask32 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_adds_epi8 (__mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddsb
CPUID Flags: AVX512VL + AVX512BW
Description
Add packed 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k[j]
dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] )
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpaddsb
__m512i _mm512_adds_epi8 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_adds_epi8 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpaddsb
CPUID Flags: AVX512BW
Description
Add packed 8-bit integers in a and b using saturation, and store the results in dst.
Operation
FOR j := 0 to 63
i := j*8
dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] )
ENDFOR
dst[MAX:512] := 0
vpaddsb
__m512i _mm512_mask_adds_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_adds_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpaddsb
CPUID Flags: AVX512BW
Description
Add packed 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k[j]
dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] )
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:512] := 0
vpaddsb
__m512i _mm512_maskz_adds_epi8 (__mmask64 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_adds_epi8 (__mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpaddsb
CPUID Flags: AVX512BW
Description
Add packed 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k[j]
dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] )
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
paddusw
__m128i _mm_adds_epu16 (__m128i a, __m128i b)
Synopsis
__m128i _mm_adds_epu16 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: paddusw xmm, xmm
CPUID Flags: SSE2
Description
Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*16
dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] )
ENDFOR
Performance
vpaddusw
__m128i _mm_mask_adds_epu16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_adds_epu16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpaddusw
CPUID Flags: AVX512VL + AVX512BW
Description
Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] )
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:128] := 0
vpaddusw
__m128i _mm_maskz_adds_epu16 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_adds_epu16 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpaddusw
CPUID Flags: AVX512VL + AVX512BW
Description
Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] )
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpaddusw
__m256i _mm256_adds_epu16 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_adds_epu16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddusw ymm, ymm, ymm
CPUID Flags: AVX2
Description
Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*16
dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] )
ENDFOR
dst[MAX:256] := 0
Performance
vpaddusw
__m256i _mm256_mask_adds_epu16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_adds_epu16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddusw
CPUID Flags: AVX512VL + AVX512BW
Description
Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] )
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
vpaddusw
__m256i _mm256_maskz_adds_epu16 (__mmask16 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_adds_epu16 (__mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddusw
CPUID Flags: AVX512VL + AVX512BW
Description
Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] )
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpaddusw
__m512i _mm512_adds_epu16 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_adds_epu16 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpaddusw
CPUID Flags: AVX512BW
Description
Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst.
Operation
FOR j := 0 to 31
i := j*16
dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] )
ENDFOR
dst[MAX:512] := 0
vpaddusw
__m512i _mm512_mask_adds_epu16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_adds_epu16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpaddusw
CPUID Flags: AVX512BW
Description
Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] )
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:512] := 0
vpaddusw
__m512i _mm512_maskz_adds_epu16 (__mmask32 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_adds_epu16 (__mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpaddusw
CPUID Flags: AVX512BW
Description
Add packed unsigned 16-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] )
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
paddusb
__m128i _mm_adds_epu8 (__m128i a, __m128i b)
Synopsis
__m128i _mm_adds_epu8 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: paddusb xmm, xmm
CPUID Flags: SSE2
Description
Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*8
dst[i+7:i] := Saturate_To_UnsignedInt8( a[i+7:i] + b[i+7:i] )
ENDFOR
Performance
vpaddusb
__m128i _mm_mask_adds_epu8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_adds_epu8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpaddusb
CPUID Flags: AVX512VL + AVX512BW
Description
Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k[j]
dst[i+7:i] := Saturate_To_UnsignedInt8( a[i+7:i] + b[i+7:i] )
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:128] := 0
vpaddusb
__m128i _mm_maskz_adds_epu8 (__mmask16 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_adds_epu8 (__mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpaddusb
CPUID Flags: AVX512VL + AVX512BW
Description
Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k[j]
dst[i+7:i] := Saturate_To_UnsignedInt8( a[i+7:i] + b[i+7:i] )
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpaddusb
__m256i _mm256_adds_epu8 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_adds_epu8 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddusb ymm, ymm, ymm
CPUID Flags: AVX2
Description
Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst.
Operation
FOR j := 0 to 31
i := j*8
dst[i+7:i] := Saturate_To_UnsignedInt8( a[i+7:i] + b[i+7:i] )
ENDFOR
dst[MAX:256] := 0
Performance
vpaddusb
__m256i _mm256_mask_adds_epu8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_adds_epu8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddusb
CPUID Flags: AVX512VL + AVX512BW
Description
Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k[j]
dst[i+7:i] := Saturate_To_UnsignedInt8( a[i+7:i] + b[i+7:i] )
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:256] := 0
vpaddusb
__m256i _mm256_maskz_adds_epu8 (__mmask32 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_adds_epu8 (__mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpaddusb
CPUID Flags: AVX512VL + AVX512BW
Description
Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k[j]
dst[i+7:i] := Saturate_To_UnsignedInt8( a[i+7:i] + b[i+7:i] )
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpaddusb
__m512i _mm512_adds_epu8 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_adds_epu8 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpaddusb
CPUID Flags: AVX512BW
Description
Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst.
Operation
FOR j := 0 to 63
i := j*8
dst[i+7:i] := Saturate_To_UnsignedInt8( a[i+7:i] + b[i+7:i] )
ENDFOR
dst[MAX:512] := 0
vpaddusb
__m512i _mm512_mask_adds_epu8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_adds_epu8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpaddusb
CPUID Flags: AVX512BW
Description
Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k[j]
dst[i+7:i] := Saturate_To_UnsignedInt8( a[i+7:i] + b[i+7:i] )
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:512] := 0
vpaddusb
__m512i _mm512_maskz_adds_epu8 (__mmask64 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_adds_epu8 (__mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpaddusb
CPUID Flags: AVX512BW
Description
Add packed unsigned 8-bit integers in a and b using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k[j]
dst[i+7:i] := Saturate_To_UnsignedInt8( a[i+7:i] + b[i+7:i] )
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpaddsetcd
__m512i _mm512_addsetc_epi32 (__m512i v2, __m512i v3, __mmask16 * k2_res)
Synopsis
__m512i _mm512_addsetc_epi32 (__m512i v2, __m512i v3, __mmask16 * k2_res)
#include "immintrin.h"
Instruction: vpaddsetcd zmm {k}, k, zmm
CPUID Flags: KNCNI
Description
Performs element-by-element addition of packed 32-bit integer elements in v2 and v3, storing the resultant carry in k2_res (carry flag) and the addition results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := v2[i+31:i] + v3[i+31:i]
k2_res[j] := Carry(v2[i+31:i] + v3[i+31:i])
ENDFOR
dst[MAX:512] := 0
vpaddsetcd
__m512i _mm512_mask_addsetc_epi32 (__m512i v2, __mmask16 k, __mmask16 k_old, __m512i v3, __mmask16 * k2_res)
Synopsis
__m512i _mm512_mask_addsetc_epi32 (__m512i v2, __mmask16 k, __mmask16 k_old, __m512i v3, __mmask16 * k2_res)
#include "immintrin.h"
Instruction: vpaddsetcd zmm {k}, k, zmm
CPUID Flags: KNCNI
Description
Performs element-by-element addition of packed 32-bit integer elements in v2 and v3, storing the resultant carry in k2_res (carry flag) and the addition results in dst using writemask k (elements are copied from v2 and k_old when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := v2[i+31:i] + v3[i+31:i]
ELSE
dst[i+31:i] := v2[i+31:i]
k2_res[j] := k_old[j]
FI
ENDFOR
dst[MAX:512] := 0
vpaddsetsd
__m512i _mm512_addsets_epi32 (__m512i v2, __m512i v3, __mmask16 * sign)
Synopsis
__m512i _mm512_addsets_epi32 (__m512i v2, __m512i v3, __mmask16 * sign)
#include "immintrin.h"
Instruction: vpaddsetsd zmm {k}, zmm, zmm
CPUID Flags: KNCNI
Description
Performs an element-by-element addition of packed 32-bit integer elements in v2 and v3, storing the results in dst and the sign of the sum in sign (sign flag).
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := v2[i+31:i] + v3[i+31:i]
sign[j] := v2[i+31:i] & v3[i+31:i] & 0x80000000
ENDFOR
dst[MAX:512] := 0
vpaddsetsd
__m512i _mm512_mask_addsets_epi32 (__m512i src, __mmask16 k, __m512i v2, __m512i v3, __mmask16 * sign)
Synopsis
__m512i _mm512_mask_addsets_epi32 (__m512i src, __mmask16 k, __m512i v2, __m512i v3, __mmask16 * sign)
#include "immintrin.h"
Instruction: vpaddsetsd zmm {k}, zmm, zmm
CPUID Flags: KNCNI
Description
Performs an element-by-element addition of packed 32-bit integer elements in v2 and v3, storing the results in dst and the sign of the sum in sign (sign flag). Results are stored using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := v2[i+31:i] + v3[i+31:i]
sign[j] := v2[i+31:i] & v3[i+31:i] & 0x80000000
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vaddsetsps
__m512 _mm512_addsets_ps (__m512 v2, __m512 v3, __mmask16 * sign)
Synopsis
__m512 _mm512_addsets_ps (__m512 v2, __m512 v3, __mmask16 * sign)
#include "immintrin.h"
Instruction: vaddsetsps zmm {k}, zmm, zmm
CPUID Flags: KNCNI
Description
Performs an element-by-element addition of packed single-precision (32-bit) floating-point elements in v2 and v3, storing the results in dst and the sign of the sum in sign (sign flag).
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := v2[i+31:i] + v3[i+31:i]
sign[j] := v2[i+31:i] & v3[i+31:i] & 0x80000000
ENDFOR
dst[MAX:512] := 0
vaddsetsps
__m512 _mm512_mask_addsets_ps (__m512 src, __mmask16 k, __m512 v2, __m512 v3, __mmask16 * sign)
Synopsis
__m512 _mm512_mask_addsets_ps (__m512 src, __mmask16 k, __m512 v2, __m512 v3, __mmask16 * sign)
#include "immintrin.h"
Instruction: vaddsetsps zmm {k}, zmm, zmm
CPUID Flags: KNCNI
Description
Performs an element-by-element addition of packed single-precision (32-bit) floating-point elements in v2 and v3, storing the results in dst and the sign of the sum in sign (sign flag). Results are stored using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := v2[i+31:i] + v3[i+31:i]
sign[j] := v2[i+31:i] & v3[i+31:i] & 0x80000000
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vaddsetsps
__m512 _mm512_addsets_round_ps (__m512 v2, __m512 v3, __mmask16 * sign, int rounding)
Synopsis
__m512 _mm512_addsets_round_ps (__m512 v2, __m512 v3, __mmask16 * sign, int rounding)
#include "immintrin.h"
Instruction: vaddsetsps zmm {k}, zmm, zmm
CPUID Flags: KNCNI
Description
Performs an element-by-element addition of packed single-precision (32-bit) floating-point elements in
v2 and
v3, storing the results in
dst and the sign of the sum in
sign (sign flag).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := v2[i+31:i] + v3[i+31:i]
sign[j] := v2[i+31:i] & v3[i+31:i] & 0x80000000
ENDFOR
dst[MAX:512] := 0
vaddsetsps
__m512 _mm512_mask_addsets_round_ps (__m512 src, __mmask16 k, __m512 v2, __m512 v3, __mmask16 * sign, int rounding)
Synopsis
__m512 _mm512_mask_addsets_round_ps (__m512 src, __mmask16 k, __m512 v2, __m512 v3, __mmask16 * sign, int rounding)
#include "immintrin.h"
Instruction: vaddsetsps zmm {k}, zmm, zmm
CPUID Flags: KNCNI
Description
Performs an element-by-element addition of packed single-precision (32-bit) floating-point elements in
v2 and
v3, storing the results in
dst and the sign of the sum in
sign (sign flag). Results are stored using writemask
k (elements are copied from
src when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := v2[i+31:i] + v3[i+31:i]
sign[j] := v2[i+31:i] & v3[i+31:i] & 0x80000000
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
addsubpd
__m128d _mm_addsub_pd (__m128d a, __m128d b)
Synopsis
__m128d _mm_addsub_pd (__m128d a, __m128d b)
#include "pmmintrin.h"
Instruction: addsubpd xmm, xmm
CPUID Flags: SSE3
Description
Alternatively add and subtract packed double-precision (64-bit) floating-point elements in a to/from packed elements in b, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
IF (j is even)
dst[i+63:i] := a[i+63:i] - b[i+63:i]
ELSE
dst[i+63:i] := a[i+63:i] + b[i+63:i]
FI
ENDFOR
Performance
vaddsubpd
__m256d _mm256_addsub_pd (__m256d a, __m256d b)
Synopsis
__m256d _mm256_addsub_pd (__m256d a, __m256d b)
#include "immintrin.h"
Instruction: vaddsubpd ymm, ymm, ymm
CPUID Flags: AVX
Description
Alternatively add and subtract packed double-precision (64-bit) floating-point elements in a to/from packed elements in b, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
IF (j is even)
dst[i+63:i] := a[i+63:i] - b[i+63:i]
ELSE
dst[i+63:i] := a[i+63:i] + b[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
Performance
addsubps
__m128 _mm_addsub_ps (__m128 a, __m128 b)
Synopsis
__m128 _mm_addsub_ps (__m128 a, __m128 b)
#include "pmmintrin.h"
Instruction: addsubps xmm, xmm
CPUID Flags: SSE3
Description
Alternatively add and subtract packed single-precision (32-bit) floating-point elements in a to/from packed elements in b, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
IF (j is even)
dst[i+31:i] := a[i+31:i] - b[i+31:i]
ELSE
dst[i+31:i] := a[i+31:i] + b[i+31:i]
FI
ENDFOR
Performance
vaddsubps
__m256 _mm256_addsub_ps (__m256 a, __m256 b)
Synopsis
__m256 _mm256_addsub_ps (__m256 a, __m256 b)
#include "immintrin.h"
Instruction: vaddsubps ymm, ymm, ymm
CPUID Flags: AVX
Description
Alternatively add and subtract packed single-precision (32-bit) floating-point elements in a to/from packed elements in b, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
IF (j is even)
dst[i+31:i] := a[i+31:i] - b[i+31:i]
ELSE
dst[i+31:i] := a[i+31:i] + b[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
Performance
aesdec
__m128i _mm_aesdec_si128 (__m128i a, __m128i RoundKey)
Synopsis
__m128i _mm_aesdec_si128 (__m128i a, __m128i RoundKey)
#include "wmmintrin.h"
Instruction: aesdec xmm, xmm
CPUID Flags: AES
Description
Perform one round of an AES decryption flow on data (state) in a using the round key in RoundKey, and store the result in dst.
Operation
state := a
a[127:0] := InvShiftRows(a[127:0])
a[127:0] := InvSubBytes(a[127:0])
a[127:0] := InvMixColumns(a[127:0])
dst[127:0] := a[127:0] XOR RoundKey[127:0]
Performance
aesdeclast
__m128i _mm_aesdeclast_si128 (__m128i a, __m128i RoundKey)
Synopsis
__m128i _mm_aesdeclast_si128 (__m128i a, __m128i RoundKey)
#include "wmmintrin.h"
Instruction: aesdeclast xmm, xmm
CPUID Flags: AES
Description
Perform the last round of an AES decryption flow on data (state) in a using the round key in RoundKey, and store the result in dst.
Operation
state := a
a[127:0] := InvShiftRows(a[127:0])
a[127:0] := InvSubBytes(a[127:0])
dst[127:0] := a[127:0] XOR RoundKey[127:0]
Performance
aesenc
__m128i _mm_aesenc_si128 (__m128i a, __m128i RoundKey)
Synopsis
__m128i _mm_aesenc_si128 (__m128i a, __m128i RoundKey)
#include "wmmintrin.h"
Instruction: aesenc xmm, xmm
CPUID Flags: AES
Description
Perform one round of an AES encryption flow on data (state) in a using the round key in RoundKey, and store the result in dst.
Operation
state := a
a[127:0] := ShiftRows(a[127:0])
a[127:0] := SubBytes(a[127:0])
a[127:0] := MixColumns(a[127:0])
dst[127:0] := a[127:0] XOR RoundKey[127:0]
Performance
aesenclast
__m128i _mm_aesenclast_si128 (__m128i a, __m128i RoundKey)
Synopsis
__m128i _mm_aesenclast_si128 (__m128i a, __m128i RoundKey)
#include "wmmintrin.h"
Instruction: aesenclast xmm, xmm
CPUID Flags: AES
Description
Perform the last round of an AES encryption flow on data (state) in a using the round key in RoundKey, and store the result in dst.
Operation
state := a
a[127:0] := ShiftRows(a[127:0])
a[127:0] := SubBytes(a[127:0])
dst[127:0] := a[127:0] XOR RoundKey[127:0]
Performance
aesimc
__m128i _mm_aesimc_si128 (__m128i a)
Synopsis
__m128i _mm_aesimc_si128 (__m128i a)
#include "wmmintrin.h"
Instruction: aesimc xmm, xmm
CPUID Flags: AES
Description
Perform the InvMixColumns transformation on a and store the result in dst.
Operation
dst[127:0] := InvMixColumns(a[127:0])
Performance
aeskeygenassist
__m128i _mm_aeskeygenassist_si128 (__m128i a, const int imm8)
Synopsis
__m128i _mm_aeskeygenassist_si128 (__m128i a, const int imm8)
#include "wmmintrin.h"
Instruction: aeskeygenassist xmm, xmm, imm
CPUID Flags: AES
Description
Assist in expanding the AES cipher key by computing steps towards generating a round key for encryption cipher using data from a and an 8-bit round constant specified in imm8, and store the result in dst.
Operation
X3[31:0] := a[127:96]
X2[31:0] := a[95:64]
X1[31:0] := a[63:32]
X0[31:0] := a[31:0]
RCON[31:0] := ZeroExtend(imm8[7:0])
dst[31:0] := SubWord(X1)
dst[63:32] := RotWord(SubWord(X1)) XOR RCON
dst[95:64] := SubWord(X3)
dst[127:96] := RotWord(SubWord(X3)) XOR RCON
Performance
valignd
__m128i _mm_alignr_epi32 (__m128i a, __m128i b, const int count)
Synopsis
__m128i _mm_alignr_epi32 (__m128i a, __m128i b, const int count)
#include "immintrin.h"
Instruction: valignd
CPUID Flags: AVX512F + AVX512VL
Description
Concatenate a and b into a 32-byte immediate result, shift the result right by count 32-bit elements, and store the low 16 bytes (4 elements) in dst.
Operation
temp[255:128] := a[127:0]
temp[127:0] := b[127:0]
temp[255:0] := temp[255:0] >> (32*count)
dst[127:0] := temp[127:0]
dst[MAX:128] := 0
valignd
__m128i _mm_mask_alignr_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b, const int count)
Synopsis
__m128i _mm_mask_alignr_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b, const int count)
#include "immintrin.h"
Instruction: valignd
CPUID Flags: AVX512F + AVX512VL
Description
Concatenate a and b into a 32-byte immediate result, shift the result right by count 32-bit elements, and store the low 16 bytes (4 elements) in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
temp[255:128] := a[127:0]
temp[127:0] := b[127:0]
temp[255:0] := temp[255:0] >> (32*count)
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := temp[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
valignd
__m128i _mm_maskz_alignr_epi32 (__mmask8 k, __m128i a, __m128i b, const int count)
Synopsis
__m128i _mm_maskz_alignr_epi32 (__mmask8 k, __m128i a, __m128i b, const int count)
#include "immintrin.h"
Instruction: valignd
CPUID Flags: AVX512F + AVX512VL
Description
Concatenate a and b into a 32-byte immediate result, shift the result right by count 32-bit elements, and store the low 16 bytes (4 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
temp[255:128] := a[127:0]
temp[127:0] := b[127:0]
temp[255:0] := temp[255:0] >> (32*count)
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := temp[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
valignd
__m256i _mm256_alignr_epi32 (__m256i a, __m256i b, const int count)
Synopsis
__m256i _mm256_alignr_epi32 (__m256i a, __m256i b, const int count)
#include "immintrin.h"
Instruction: valignd
CPUID Flags: AVX512F + AVX512VL
Description
Concatenate a and b into a 64-byte immediate result, shift the result right by count 32-bit elements, and store the low 32 bytes (8 elements) in dst.
Operation
temp[511:256] := a[255:0]
temp[255:0] := b[255:0]
temp[511:0] := temp[511:0] >> (32*count)
dst[255:0] := temp[255:0]
dst[MAX:256] := 0
valignd
__m256i _mm256_mask_alignr_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b, const int count)
Synopsis
__m256i _mm256_mask_alignr_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b, const int count)
#include "immintrin.h"
Instruction: valignd
CPUID Flags: AVX512F + AVX512VL
Description
Concatenate a and b into a 64-byte immediate result, shift the result right by count 32-bit elements, and store the low 32 bytes (8 elements) in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
temp[511:256] := a[255:0]
temp[255:0] := b[255:0]
temp[511:0] := temp[511:0] >> (32*count)
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := temp[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
valignd
__m256i _mm256_maskz_alignr_epi32 (__mmask8 k, __m256i a, __m256i b, const int count)
Synopsis
__m256i _mm256_maskz_alignr_epi32 (__mmask8 k, __m256i a, __m256i b, const int count)
#include "immintrin.h"
Instruction: valignd
CPUID Flags: AVX512F + AVX512VL
Description
Concatenate a and b into a 64-byte immediate result, shift the result right by count 32-bit elements, and store the low 32 bytes (8 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
temp[511:256] := a[255:0]
temp[255:0] := b[255:0]
temp[511:0] := temp[511:0] >> (32*count)
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := temp[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
valignd
__m512i _mm512_alignr_epi32 (__m512i a, __m512i b, const int count)
Synopsis
__m512i _mm512_alignr_epi32 (__m512i a, __m512i b, const int count)
#include "immintrin.h"
Instruction: valignd zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Concatenate a and b into a 128-byte immediate result, shift the result right by count 32-bit elements, and store the low 64 bytes (16 elements) in dst.
Operation
temp[1023:512] := a[511:0]
temp[511:0] := b[511:0]
temp[1023:0] := temp[1023:0] >> (32*count)
dst[511:0] := temp[511:0]
dst[MAX:512] := 0
valignd
__m512i _mm512_mask_alignr_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b, const int count)
Synopsis
__m512i _mm512_mask_alignr_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b, const int count)
#include "immintrin.h"
Instruction: valignd zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Concatenate a and b into a 128-byte immediate result, shift the result right by count 32-bit elements, and store the low 64 bytes (16 elements) in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
temp[1023:512] := a[511:0]
temp[511:0] := b[511:0]
temp[1023:0] := temp[1023:0] >> (32*count)
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := temp[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
valignd
__m512i _mm512_maskz_alignr_epi32 (__mmask16 k, __m512i a, __m512i b, const int count)
Synopsis
__m512i _mm512_maskz_alignr_epi32 (__mmask16 k, __m512i a, __m512i b, const int count)
#include "immintrin.h"
Instruction: valignd zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Concatenate a and b into a 128-byte immediate result, shift the result right by count 32-bit elements, and store the low 64 bytes (16 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
temp[1023:512] := a[511:0]
temp[511:0] := b[511:0]
temp[1023:0] := temp[1023:0] >> (32*count)
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := temp[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
valignq
__m128i _mm_alignr_epi64 (__m128i a, __m128i b, const int count)
Synopsis
__m128i _mm_alignr_epi64 (__m128i a, __m128i b, const int count)
#include "immintrin.h"
Instruction: valignq
CPUID Flags: AVX512F + AVX512VL
Description
Concatenate a and b into a 32-byte immediate result, shift the result right by count 64-bit elements, and store the low 16 bytes (2 elements) in dst.
Operation
temp[255:128] := a[127:0]
temp[127:0] := b[127:0]
temp[255:0] := temp[255:0] >> (64*count)
dst[127:0] := temp[127:0]
dst[MAX:128] := 0
valignq
__m128i _mm_mask_alignr_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b, const int count)
Synopsis
__m128i _mm_mask_alignr_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b, const int count)
#include "immintrin.h"
Instruction: valignq
CPUID Flags: AVX512F + AVX512VL
Description
Concatenate a and b into a 32-byte immediate result, shift the result right by count 64-bit elements, and store the low 16 bytes (2 elements) in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
temp[255:128] := a[127:0]
temp[127:0] := b[127:0]
temp[255:0] := temp[255:0] >> (64*count)
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := temp[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
valignq
__m128i _mm_maskz_alignr_epi64 (__mmask8 k, __m128i a, __m128i b, const int count)
Synopsis
__m128i _mm_maskz_alignr_epi64 (__mmask8 k, __m128i a, __m128i b, const int count)
#include "immintrin.h"
Instruction: valignq
CPUID Flags: AVX512F + AVX512VL
Description
Concatenate a and b into a 32-byte immediate result, shift the result right by count 64-bit elements, and store the low 16 bytes (2 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
temp[255:128] := a[127:0]
temp[127:0] := b[127:0]
temp[255:0] := temp[255:0] >> (64*count)
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := temp[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
valignq
__m256i _mm256_alignr_epi64 (__m256i a, __m256i b, const int count)
Synopsis
__m256i _mm256_alignr_epi64 (__m256i a, __m256i b, const int count)
#include "immintrin.h"
Instruction: valignq
CPUID Flags: AVX512F + AVX512VL
Description
Concatenate a and b into a 64-byte immediate result, shift the result right by count 64-bit elements, and store the low 32 bytes (4 elements) in dst.
Operation
temp[511:256] := a[255:0]
temp[255:0] := b[255:0]
temp[511:0] := temp[511:0] >> (64*count)
dst[255:0] := temp[255:0]
dst[MAX:256] := 0
valignq
__m256i _mm256_mask_alignr_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b, const int count)
Synopsis
__m256i _mm256_mask_alignr_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b, const int count)
#include "immintrin.h"
Instruction: valignq
CPUID Flags: AVX512F + AVX512VL
Description
Concatenate a and b into a 64-byte immediate result, shift the result right by count 64-bit elements, and store the low 32 bytes (4 elements) in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
temp[511:256] := a[255:0]
temp[255:0] := b[255:0]
temp[511:0] := temp[511:0] >> (64*count)
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := temp[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
valignq
__m256i _mm256_maskz_alignr_epi64 (__mmask8 k, __m256i a, __m256i b, const int count)
Synopsis
__m256i _mm256_maskz_alignr_epi64 (__mmask8 k, __m256i a, __m256i b, const int count)
#include "immintrin.h"
Instruction: valignq
CPUID Flags: AVX512F + AVX512VL
Description
Concatenate a and b into a 64-byte immediate result, shift the result right by count 64-bit elements, and store the low 32 bytes (4 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
temp[511:256] := a[255:0]
temp[255:0] := b[255:0]
temp[511:0] := temp[511:0] >> (64*count)
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := temp[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
valignq
__m512i _mm512_alignr_epi64 (__m512i a, __m512i b, const int count)
Synopsis
__m512i _mm512_alignr_epi64 (__m512i a, __m512i b, const int count)
#include "immintrin.h"
Instruction: valignq zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Concatenate a and b into a 128-byte immediate result, shift the result right by count 64-bit elements, and store the low 64 bytes (8 elements) in dst.
Operation
temp[1023:512] := a[511:0]
temp[511:0] := b[511:0]
temp[1023:0] := temp[1023:0] >> (64*count)
dst[511:0] := temp[511:0]
dst[MAX:512] := 0
valignq
__m512i _mm512_mask_alignr_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b, const int count)
Synopsis
__m512i _mm512_mask_alignr_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b, const int count)
#include "immintrin.h"
Instruction: valignq zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Concatenate a and b into a 128-byte immediate result, shift the result right by count 64-bit elements, and store the low 64 bytes (8 elements) in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
temp[1023:512] := a[511:0]
temp[511:0] := b[511:0]
temp[1023:0] := temp[1023:0] >> (64*count)
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := temp[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
valignq
__m512i _mm512_maskz_alignr_epi64 (__mmask8 k, __m512i a, __m512i b, const int count)
Synopsis
__m512i _mm512_maskz_alignr_epi64 (__mmask8 k, __m512i a, __m512i b, const int count)
#include "immintrin.h"
Instruction: valignq zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Concatenate a and b into a 128-byte immediate result, shift the result right by count 64-bit elements, and store the low 64 bytes (8 elements) in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
temp[1023:512] := a[511:0]
temp[511:0] := b[511:0]
temp[1023:0] := temp[1023:0] >> (64*count)
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := temp[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
palignr
__m128i _mm_alignr_epi8 (__m128i a, __m128i b, int count)
Synopsis
__m128i _mm_alignr_epi8 (__m128i a, __m128i b, int count)
#include "tmmintrin.h"
Instruction: palignr xmm, xmm, imm
CPUID Flags: SSSE3
Description
Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by count bytes, and store the low 16 bytes in dst.
Operation
tmp[255:0] := ((a[127:0] << 128) OR b[127:0]) >> (count[7:0]*8)
dst[127:0] := tmp[127:0]
Performance
vpalignr
__m128i _mm_mask_alignr_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b, const int count)
Synopsis
__m128i _mm_mask_alignr_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b, const int count)
#include "immintrin.h"
Instruction: vpalignr
CPUID Flags: AVX512VL + AVX512BW
Description
Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by count bytes, and store the low 16 bytes in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
tmp_dst[255:0] := ((a[127:0] << 128) OR b[127:0]) >> (count[7:0]*8)
FOR j := 0 to 15
i := j*8
IF k[j]
dst[i+7:i] := tmp_dst[i+7:i]
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:128] := 0
vpalignr
__m128i _mm_maskz_alignr_epi8 (__mmask16 k, __m128i a, __m128i b, const int count)
Synopsis
__m128i _mm_maskz_alignr_epi8 (__mmask16 k, __m128i a, __m128i b, const int count)
#include "immintrin.h"
Instruction: vpalignr
CPUID Flags: AVX512VL + AVX512BW
Description
Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by count bytes, and store the low 16 bytes in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
tmp_dst[255:0] := ((a[127:0] << 128) OR b[127:0]) >> (count[7:0]*8)
FOR j := 0 to 15
i := j*8
IF k[j]
dst[i+7:i] := tmp_dst[i+7:i]
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpalignr
__m256i _mm256_alignr_epi8 (__m256i a, __m256i b, const int count)
Synopsis
__m256i _mm256_alignr_epi8 (__m256i a, __m256i b, const int count)
#include "immintrin.h"
Instruction: vpalignr ymm, ymm, ymm, imm
CPUID Flags: AVX2
Description
Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by count bytes, and store the low 16 bytes in dst.
Operation
FOR j := 0 to 1
i := j*128
tmp[255:0] := ((a[i+127:i] << 128) OR b[i+127:i]) >> (count[7:0]*8)
dst[i+127:i] := tmp[127:0]
ENDFOR
dst[MAX:256] := 0
Performance
vpalignr
__m256i _mm256_mask_alignr_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b, const int count)
Synopsis
__m256i _mm256_mask_alignr_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b, const int count)
#include "immintrin.h"
Instruction: vpalignr
CPUID Flags: AVX512VL + AVX512BW
Description
Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by count bytes, and store the low 16 bytes in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*128
tmp[255:0] := ((a[i+127:i] << 128) OR b[i+127:i]) >> (count[7:0]*8)
tmp_dst[i+127:i] := tmp[127:0]
ENDFOR
FOR j := 0 to 31
i := j*8
IF k[j]
dst[i+7:i] := tmp_dst[i+7:i]
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:256] := 0
vpalignr
__m256i _mm256_maskz_alignr_epi8 (__mmask32 k, __m256i a, __m256i b, const int count)
Synopsis
__m256i _mm256_maskz_alignr_epi8 (__mmask32 k, __m256i a, __m256i b, const int count)
#include "immintrin.h"
Instruction: vpalignr
CPUID Flags: AVX512VL + AVX512BW
Description
Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by count bytes, and store the low 16 bytes in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*128
tmp[255:0] := ((a[i+127:i] << 128) OR b[i+127:i]) >> (count[7:0]*8)
tmp_dst[i+127:i] := tmp[127:0]
ENDFOR
FOR j := 0 to 31
i := j*8
IF k[j]
dst[i+7:i] := tmp_dst[i+7:i]
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpalignr
__m512i _mm512_alignr_epi8 (__m512i a, __m512i b, const int count)
Synopsis
__m512i _mm512_alignr_epi8 (__m512i a, __m512i b, const int count)
#include "immintrin.h"
Instruction: vpalignr
CPUID Flags: AVX512BW
Description
Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by count bytes, and store the low 16 bytes in dst.
Operation
FOR j := 0 to 3
i := j*128
tmp[255:0] := ((a[i+127:i] << 128) OR b[i+127:i]) >> (count[7:0]*8)
dst[i+127:i] := tmp[127:0]
ENDFOR
dst[MAX:512] := 0
vpalignr
__m512i _mm512_mask_alignr_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b, const int count)
Synopsis
__m512i _mm512_mask_alignr_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b, const int count)
#include "immintrin.h"
Instruction: vpalignr
CPUID Flags: AVX512BW
Description
Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by count bytes, and store the low 16 bytes in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*128
tmp[255:0] := ((a[i+127:i] << 128) OR b[i+127:i]) >> (count[7:0]*8)
tmp_dst[i+127:i] := tmp[127:0]
ENDFOR
FOR j := 0 to 63
i := j*8
IF k[j]
dst[i+7:i] := tmp_dst[i+7:i]
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:512] := 0
vpalignr
__m512i _mm512_maskz_alignr_epi8 (__mmask64 k, __m512i a, __m512i b, const int count)
Synopsis
__m512i _mm512_maskz_alignr_epi8 (__mmask64 k, __m512i a, __m512i b, const int count)
#include "immintrin.h"
Instruction: vpalignr
CPUID Flags: AVX512BW
Description
Concatenate pairs of 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by count bytes, and store the low 16 bytes in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*128
tmp[255:0] := ((a[i+127:i] << 128) OR b[i+127:i]) >> (count[7:0]*8)
tmp_dst[i+127:i] := tmp[127:0]
ENDFOR
FOR j := 0 to 63
i := j*8
IF k[j]
dst[i+7:i] := tmp_dst[i+7:i]
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
palignr
__m64 _mm_alignr_pi8 (__m64 a, __m64 b, int count)
Synopsis
__m64 _mm_alignr_pi8 (__m64 a, __m64 b, int count)
#include "tmmintrin.h"
Instruction: palignr mm, mm, imm
CPUID Flags: SSSE3
Description
Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift the result right by count bytes, and store the low 8 bytes in dst.
Operation
tmp[127:0] := ((a[63:0] << 64) OR b[63:0]) >> (count[7:0]*8)
dst[63:0] := tmp[63:0]
...
void _allow_cpu_features (unsigned __int64 a)
Synopsis
void _allow_cpu_features (unsigned __int64 a)
#include "immintrin.h"
Description
Treat the processor-specific feature(s) specified in a as available. Multiple features may be OR'd together. See the valid feature flags below:
Operation
_FEATURE_GENERIC_IA32
_FEATURE_FPU
_FEATURE_CMOV
_FEATURE_MMX
_FEATURE_FXSAVE
_FEATURE_SSE
_FEATURE_SSE2
_FEATURE_SSE3
_FEATURE_SSSE3
_FEATURE_SSE4_1
_FEATURE_SSE4_2
_FEATURE_MOVBE
_FEATURE_POPCNT
_FEATURE_PCLMULQDQ
_FEATURE_AES
_FEATURE_F16C
_FEATURE_AVX
_FEATURE_RDRND
_FEATURE_FMA
_FEATURE_BMI
_FEATURE_LZCNT
_FEATURE_HLE
_FEATURE_RTM
_FEATURE_AVX2
_FEATURE_KNCNI
_FEATURE_AVX512F
_FEATURE_ADX
_FEATURE_RDSEED
_FEATURE_AVX512ER
_FEATURE_AVX512PF
_FEATURE_AVX512CD
_FEATURE_SHA
_FEATURE_MPX
vpandd
__m128i _mm_mask_and_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_and_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpandd
CPUID Flags: AVX512VL + AVX512F
Description
Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] AND b[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vpandd
__m128i _mm_maskz_and_epi32 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_and_epi32 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpandd
CPUID Flags: AVX512VL + AVX512F
Description
Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] AND b[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpandd
__m256i _mm256_mask_and_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_and_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpandd
CPUID Flags: AVX512VL + AVX512F
Description
Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] AND b[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vpandd
__m256i _mm256_maskz_and_epi32 (__mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_and_epi32 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpandd
CPUID Flags: AVX512VL + AVX512F
Description
Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] AND b[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpandd
__m512i _mm512_and_epi32 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_and_epi32 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpandd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := a[i+31:i] AND b[i+31:i]
ENDFOR
dst[MAX:512] := 0
vpandd
__m512i _mm512_mask_and_epi32 (__m512i src, __mmask16 k, __m512i v2, __m512i v3)
Synopsis
__m512i _mm512_mask_and_epi32 (__m512i src, __mmask16 k, __m512i v2, __m512i v3)
#include "immintrin.h"
Instruction: vpandd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Performs element-by-element bitwise AND between packed 32-bit integer elements of v2 and v3, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := v2[i+31:i] AND v3[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpandd
__m512i _mm512_maskz_and_epi32 (__mmask16 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_and_epi32 (__mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpandd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Compute the bitwise AND of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] AND b[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpandq
__m128i _mm_mask_and_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_and_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpandq
CPUID Flags: AVX512VL + AVX512F
Description
Compute the bitwise AND of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] AND b[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpandq
__m128i _mm_maskz_and_epi64 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_and_epi64 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpandq
CPUID Flags: AVX512VL + AVX512F
Description
Compute the bitwise AND of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] AND b[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpandq
__m256i _mm256_mask_and_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_and_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpandq
CPUID Flags: AVX512VL + AVX512F
Description
Compute the bitwise AND of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] AND b[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpandq
__m256i _mm256_maskz_and_epi64 (__mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_and_epi64 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpandq
CPUID Flags: AVX512VL + AVX512F
Description
Compute the bitwise AND of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] AND b[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpandq
__m512i _mm512_and_epi64 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_and_epi64 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpandq zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compute the bitwise AND of 512 bits (composed of packed 64-bit integers) in a and b, and store the results in dst.
Operation
dst[511:0] := (a[511:0] AND b[511:0])
dst[MAX:512] := 0
vpandq
__m512i _mm512_mask_and_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_and_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpandq zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compute the bitwise AND of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] AND b[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpandq
__m512i _mm512_maskz_and_epi64 (__mmask8 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_and_epi64 (__mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpandq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Compute the bitwise AND of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] AND b[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
andpd
__m128d _mm_and_pd (__m128d a, __m128d b)
Synopsis
__m128d _mm_and_pd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: andpd xmm, xmm
CPUID Flags: SSE2
Description
Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := (a[i+63:i] AND b[i+63:i])
ENDFOR
Performance
vandpd
__m128d _mm_mask_and_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_mask_and_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vandpd
CPUID Flags: AVX512VL + AVX512DQ
Description
Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := (a[i+63:i] AND b[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vandpd
__m128d _mm_maskz_and_pd (__mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_maskz_and_pd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vandpd
CPUID Flags: AVX512VL + AVX512DQ
Description
Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := (a[i+63:i] AND b[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vandpd
__m256d _mm256_and_pd (__m256d a, __m256d b)
Synopsis
__m256d _mm256_and_pd (__m256d a, __m256d b)
#include "immintrin.h"
Instruction: vandpd ymm, ymm, ymm
CPUID Flags: AVX
Description
Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := (a[i+63:i] AND b[i+63:i])
ENDFOR
dst[MAX:256] := 0
Performance
vandpd
__m256d _mm256_mask_and_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)
Synopsis
__m256d _mm256_mask_and_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vandpd
CPUID Flags: AVX512VL + AVX512DQ
Description
Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := (a[i+63:i] AND b[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vandpd
__m256d _mm256_maskz_and_pd (__mmask8 k, __m256d a, __m256d b)
Synopsis
__m256d _mm256_maskz_and_pd (__mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vandpd
CPUID Flags: AVX512VL + AVX512DQ
Description
Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := (a[i+63:i] AND b[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vandpd
__m512d _mm512_and_pd (__m512d a, __m512d b)
Synopsis
__m512d _mm512_and_pd (__m512d a, __m512d b)
#include "immintrin.h"
Instruction: vandpd
CPUID Flags: AVX512DQ
Description
Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := (a[i+63:i] AND b[i+63:i])
ENDFOR
dst[MAX:512] := 0
vandpd
__m512d _mm512_mask_and_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
Synopsis
__m512d _mm512_mask_and_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vandpd
CPUID Flags: AVX512DQ
Description
Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := (a[i+63:i] AND b[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vandpd
__m512d _mm512_maskz_and_pd (__mmask8 k, __m512d a, __m512d b)
Synopsis
__m512d _mm512_maskz_and_pd (__mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vandpd
CPUID Flags: AVX512DQ
Description
Compute the bitwise AND of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := (a[i+63:i] AND b[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
andps
__m128 _mm_and_ps (__m128 a, __m128 b)
Synopsis
__m128 _mm_and_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: andps xmm, xmm
CPUID Flags: SSE
Description
Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := (a[i+31:i] AND b[i+31:i])
ENDFOR
Performance
vandps
__m128 _mm_mask_and_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_mask_and_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vandps
CPUID Flags: AVX512VL + AVX512DQ
Description
Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := (a[i+31:i] AND b[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vandps
__m128 _mm_maskz_and_ps (__mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_maskz_and_ps (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vandps
CPUID Flags: AVX512VL + AVX512DQ
Description
Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := (a[i+31:i] AND b[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vandps
__m256 _mm256_and_ps (__m256 a, __m256 b)
Synopsis
__m256 _mm256_and_ps (__m256 a, __m256 b)
#include "immintrin.h"
Instruction: vandps ymm, ymm, ymm
CPUID Flags: AVX
Description
Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := (a[i+31:i] AND b[i+31:i])
ENDFOR
dst[MAX:256] := 0
Performance
vandps
__m256 _mm256_mask_and_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)
Synopsis
__m256 _mm256_mask_and_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vandps
CPUID Flags: AVX512VL + AVX512DQ
Description
Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := (a[i+31:i] AND b[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vandps
__m256 _mm256_maskz_and_ps (__mmask8 k, __m256 a, __m256 b)
Synopsis
__m256 _mm256_maskz_and_ps (__mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vandps
CPUID Flags: AVX512VL + AVX512DQ
Description
Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := (a[i+31:i] AND b[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vandps
__m512 _mm512_and_ps (__m512 a, __m512 b)
Synopsis
__m512 _mm512_and_ps (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vandps
CPUID Flags: AVX512DQ
Description
Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := (a[i+31:i] AND b[i+31:i])
ENDFOR
dst[MAX:512] := 0
vandps
__m512 _mm512_mask_and_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
Synopsis
__m512 _mm512_mask_and_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vandps
CPUID Flags: AVX512DQ
Description
Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := (a[i+31:i] AND b[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vandps
__m512 _mm512_maskz_and_ps (__mmask16 k, __m512 a, __m512 b)
Synopsis
__m512 _mm512_maskz_and_ps (__mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vandps
CPUID Flags: AVX512DQ
Description
Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := (a[i+31:i] AND b[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
pand
__m128i _mm_and_si128 (__m128i a, __m128i b)
Synopsis
__m128i _mm_and_si128 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: pand xmm, xmm
CPUID Flags: SSE2
Description
Compute the bitwise AND of 128 bits (representing integer data) in a and b, and store the result in dst.
Operation
dst[127:0] := (a[127:0] AND b[127:0])
Performance
vpand
__m256i _mm256_and_si256 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_and_si256 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpand ymm, ymm, ymm
CPUID Flags: AVX2
Description
Compute the bitwise AND of 256 bits (representing integer data) in a and b, and store the result in dst.
Operation
dst[255:0] := (a[255:0] AND b[255:0])
dst[MAX:256] := 0
Performance
vpandd
__m512i _mm512_and_si512 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_and_si512 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpandd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compute the bitwise AND of 512 bits (representing integer data) in a and b, and store the result in dst.
Operation
dst[511:0] := (a[511:0] AND b[511:0])
dst[MAX:512] := 0
vpandnd
__m128i _mm_mask_andnot_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_andnot_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpandnd
CPUID Flags: AVX512VL + AVX512F
Description
Compute the bitwise AND NOT of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vpandnd
__m128i _mm_maskz_andnot_epi32 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_andnot_epi32 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpandnd
CPUID Flags: AVX512VL + AVX512F
Description
Compute the bitwise AND NOT of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := (NOT a[i+31:i]) AND b[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpandnd
__m256i _mm256_mask_andnot_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_andnot_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpandnd
CPUID Flags: AVX512VL + AVX512F
Description
Compute the bitwise AND NOT of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vpandnd
__m256i _mm256_maskz_andnot_epi32 (__mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_andnot_epi32 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpandnd
CPUID Flags: AVX512VL + AVX512F
Description
Compute the bitwise AND NOT of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := (NOT a[i+31:i]) AND b[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpandnd
__m512i _mm512_andnot_epi32 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_andnot_epi32 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpandnd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compute the bitwise AND NOT of packed 32-bit integers in a and b, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := (NOT a[i+31:i]) AND b[i+31:i]
ENDFOR
dst[MAX:512] := 0
vpandnd
__m512i _mm512_mask_andnot_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_andnot_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpandnd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compute the bitwise AND NOT of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpandnd
__m512i _mm512_maskz_andnot_epi32 (__mmask16 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_andnot_epi32 (__mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpandnd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Compute the bitwise AND NOT of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := (NOT a[i+31:i]) AND b[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpandnq
__m128i _mm_mask_andnot_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_andnot_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpandnq
CPUID Flags: AVX512VL + AVX512F
Description
Compute the bitwise AND NOT of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpandnq
__m128i _mm_maskz_andnot_epi64 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_andnot_epi64 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpandnq
CPUID Flags: AVX512VL + AVX512F
Description
Compute the bitwise AND NOT of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := (NOT a[i+63:i]) AND b[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpandnq
__m256i _mm256_mask_andnot_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_andnot_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpandnq
CPUID Flags: AVX512VL + AVX512F
Description
Compute the bitwise AND NOT of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpandnq
__m256i _mm256_maskz_andnot_epi64 (__mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_andnot_epi64 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpandnq
CPUID Flags: AVX512VL + AVX512F
Description
Compute the bitwise AND NOT of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := (NOT a[i+63:i]) AND b[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpandnq
__m512i _mm512_andnot_epi64 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_andnot_epi64 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpandnq zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compute the bitwise AND NOT of 512 bits (composed of packed 64-bit integers) in a and b, and store the results in dst.
Operation
dst[511:0] := ((NOT a[511:0]) AND b[511:0])
dst[MAX:512] := 0
vpandnq
__m512i _mm512_mask_andnot_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_andnot_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpandnq zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compute the bitwise AND NOT of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpandnq
__m512i _mm512_maskz_andnot_epi64 (__mmask8 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_andnot_epi64 (__mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpandnq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Compute the bitwise AND NOT of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := (NOT a[i+63:i]) AND b[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
andnpd
__m128d _mm_andnot_pd (__m128d a, __m128d b)
Synopsis
__m128d _mm_andnot_pd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: andnpd xmm, xmm
CPUID Flags: SSE2
Description
Compute the bitwise AND NOT of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
ENDFOR
Performance
vandnpd
__m128d _mm_mask_andnot_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_mask_andnot_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vandnpd
CPUID Flags: AVX512VL + AVX512DQ
Description
Compute the bitwise AND NOT of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vandnpd
__m128d _mm_maskz_andnot_pd (__mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_maskz_andnot_pd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vandnpd
CPUID Flags: AVX512VL + AVX512DQ
Description
Compute the bitwise AND NOT of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vandnpd
__m256d _mm256_andnot_pd (__m256d a, __m256d b)
Synopsis
__m256d _mm256_andnot_pd (__m256d a, __m256d b)
#include "immintrin.h"
Instruction: vandnpd ymm, ymm, ymm
CPUID Flags: AVX
Description
Compute the bitwise AND NOT of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
ENDFOR
dst[MAX:256] := 0
Performance
vandnpd
__m256d _mm256_mask_andnot_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)
Synopsis
__m256d _mm256_mask_andnot_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vandnpd
CPUID Flags: AVX512VL + AVX512DQ
Description
Compute the bitwise AND NOT of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vandnpd
__m256d _mm256_maskz_andnot_pd (__mmask8 k, __m256d a, __m256d b)
Synopsis
__m256d _mm256_maskz_andnot_pd (__mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vandnpd
CPUID Flags: AVX512VL + AVX512DQ
Description
Compute the bitwise AND NOT of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vandnpd
__m512d _mm512_andnot_pd (__m512d a, __m512d b)
Synopsis
__m512d _mm512_andnot_pd (__m512d a, __m512d b)
#include "immintrin.h"
Instruction: vandnpd
CPUID Flags: AVX512DQ
Description
Compute the bitwise AND NOT of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
ENDFOR
dst[MAX:512] := 0
vandnpd
__m512d _mm512_mask_andnot_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
Synopsis
__m512d _mm512_mask_andnot_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vandnpd
CPUID Flags: AVX512DQ
Description
Compute the bitwise AND NOT of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vandnpd
__m512d _mm512_maskz_andnot_pd (__mmask8 k, __m512d a, __m512d b)
Synopsis
__m512d _mm512_maskz_andnot_pd (__mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vandnpd
CPUID Flags: AVX512DQ
Description
Compute the bitwise AND NOT of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
andnps
__m128 _mm_andnot_ps (__m128 a, __m128 b)
Synopsis
__m128 _mm_andnot_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: andnps xmm, xmm
CPUID Flags: SSE
Description
Compute the bitwise AND NOT of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
ENDFOR
Performance
vandnps
__m128 _mm_mask_andnot_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_mask_andnot_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vandnps
CPUID Flags: AVX512VL + AVX512DQ
Description
Compute the bitwise AND NOT of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vandnps
__m128 _mm_maskz_andnot_ps (__mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_maskz_andnot_ps (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vandnps
CPUID Flags: AVX512VL + AVX512DQ
Description
Compute the bitwise AND NOT of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vandnps
__m256 _mm256_andnot_ps (__m256 a, __m256 b)
Synopsis
__m256 _mm256_andnot_ps (__m256 a, __m256 b)
#include "immintrin.h"
Instruction: vandnps ymm, ymm, ymm
CPUID Flags: AVX
Description
Compute the bitwise AND NOT of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
ENDFOR
dst[MAX:256] := 0
Performance
vandnps
__m256 _mm256_mask_andnot_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)
Synopsis
__m256 _mm256_mask_andnot_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vandnps
CPUID Flags: AVX512VL + AVX512DQ
Description
Compute the bitwise AND NOT of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vandnps
__m256 _mm256_maskz_andnot_ps (__mmask8 k, __m256 a, __m256 b)
Synopsis
__m256 _mm256_maskz_andnot_ps (__mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vandnps
CPUID Flags: AVX512VL + AVX512DQ
Description
Compute the bitwise AND NOT of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vandnps
__m512 _mm512_andnot_ps (__m512 a, __m512 b)
Synopsis
__m512 _mm512_andnot_ps (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vandnps
CPUID Flags: AVX512DQ
Description
Compute the bitwise AND NOT of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
ENDFOR
dst[MAX:512] := 0
vandnps
__m512 _mm512_mask_andnot_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
Synopsis
__m512 _mm512_mask_andnot_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vandnps
CPUID Flags: AVX512DQ
Description
Compute the bitwise AND NOT of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vandnps
__m512 _mm512_maskz_andnot_ps (__mmask16 k, __m512 a, __m512 b)
Synopsis
__m512 _mm512_maskz_andnot_ps (__mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vandnps
CPUID Flags: AVX512DQ
Description
Compute the bitwise AND NOT of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
pandn
__m128i _mm_andnot_si128 (__m128i a, __m128i b)
Synopsis
__m128i _mm_andnot_si128 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: pandn xmm, xmm
CPUID Flags: SSE2
Description
Compute the bitwise AND NOT of 128 bits (representing integer data) in a and b, and store the result in dst.
Operation
dst[127:0] := ((NOT a[127:0]) AND b[127:0])
Performance
vpandn
__m256i _mm256_andnot_si256 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_andnot_si256 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpandn ymm, ymm, ymm
CPUID Flags: AVX2
Description
Compute the bitwise AND NOT of 256 bits (representing integer data) in a and b, and store the result in dst.
Operation
dst[255:0] := ((NOT a[255:0]) AND b[255:0])
dst[MAX:256] := 0
Performance
vpandnd
__m512i _mm512_andnot_si512 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_andnot_si512 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpandnd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compute the bitwise AND NOT of 512 bits (representing integer data) in a and b, and store the result in dst.
Operation
dst[511:0] := ((NOT a[511:0]) AND b[511:0])
dst[MAX:512] := 0
...
__m128d _mm_asin_pd (__m128d a)
Synopsis
__m128d _mm_asin_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the inverse sine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := ASIN(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
...
__m256d _mm256_asin_pd (__m256d a)
Synopsis
__m256d _mm256_asin_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the inverse sine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := ASIN(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
...
__m512d _mm512_asin_pd (__m512d a)
Synopsis
__m512d _mm512_asin_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the inverse sine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := ASIN(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
...
__m512d _mm512_mask_asin_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_asin_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the inverse sine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := ASIN(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128 _mm_asin_ps (__m128 a)
Synopsis
__m128 _mm_asin_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the inverse sine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := ASIN(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256 _mm256_asin_ps (__m256 a)
Synopsis
__m256 _mm256_asin_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the inverse sine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := ASIN(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
...
__m512 _mm512_asin_ps (__m512 a)
Synopsis
__m512 _mm512_asin_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the inverse sine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := ASIN(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
...
__m512 _mm512_mask_asin_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_asin_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the inverse sine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := ASIN(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128d _mm_asinh_pd (__m128d a)
Synopsis
__m128d _mm_asinh_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the inverse hyperbolic sine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := ASINH(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
...
__m256d _mm256_asinh_pd (__m256d a)
Synopsis
__m256d _mm256_asinh_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the inverse hyperbolic sine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := ASINH(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
...
__m512d _mm512_asinh_pd (__m512d a)
Synopsis
__m512d _mm512_asinh_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the inverse hyperbolic sine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := ASINH(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
...
__m512d _mm512_mask_asinh_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_asinh_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the inverse hyperbolic sine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := ASINH(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128 _mm_asinh_ps (__m128 a)
Synopsis
__m128 _mm_asinh_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the inverse hyperbolic sine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := ASINH(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256 _mm256_asinh_ps (__m256 a)
Synopsis
__m256 _mm256_asinh_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the inverse hyperbolic sine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := ASINH(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
...
__m512 _mm512_asinh_ps (__m512 a)
Synopsis
__m512 _mm512_asinh_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the inverse hyperbolic sine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := ASINH(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
...
__m512 _mm512_mask_asinh_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_asinh_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the inverse hyperbolic sine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := ASINH(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128d _mm_atan_pd (__m128d a)
Synopsis
__m128d _mm_atan_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := ATAN(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
...
__m256d _mm256_atan_pd (__m256d a)
Synopsis
__m256d _mm256_atan_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := ATAN(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
...
__m512d _mm512_atan_pd (__m512d a)
Synopsis
__m512d _mm512_atan_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in a and store the results in dst expressed in radians.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := ATAN(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
...
__m512d _mm512_mask_atan_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_atan_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in a, and store the results in dst expressed in radians using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := ATAN(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128 _mm_atan_ps (__m128 a)
Synopsis
__m128 _mm_atan_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := ATAN(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256 _mm256_atan_ps (__m256 a)
Synopsis
__m256 _mm256_atan_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := ATAN(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
...
__m512 _mm512_atan_ps (__m512 a)
Synopsis
__m512 _mm512_atan_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in a, and store the results in dst expressed in radians.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := ATAN(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
...
__m512 _mm512_mask_atan_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_atan_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := ATAN(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128d _mm_atan2_pd (__m128d a, __m128d b)
Synopsis
__m128d _mm_atan2_pd (__m128d a, __m128d b)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in a divided by packed elements in b, and store the results in dst expressed in radians.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := ATAN(a[i+63:i] / b[i+63:i])
ENDFOR
dst[MAX:128] := 0
...
__m256d _mm256_atan2_pd (__m256d a, __m256d b)
Synopsis
__m256d _mm256_atan2_pd (__m256d a, __m256d b)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in a divided by packed elements in b, and store the results in dst expressed in radians.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := ATAN(a[i+63:i] / b[i+63:i])
ENDFOR
dst[MAX:256] := 0
...
__m512d _mm512_atan2_pd (__m512d a, __m512d b)
Synopsis
__m512d _mm512_atan2_pd (__m512d a, __m512d b)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in a divided by packed elements in b, and store the results in dst expressed in radians.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := ATAN(a[i+63:i] / b[i+63:i])
ENDFOR
dst[MAX:512] := 0
...
__m512d _mm512_mask_atan2_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
Synopsis
__m512d _mm512_mask_atan2_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in a divided by packed elements in b, and store the results in dst expressed in radians using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := ATAN(a[i+63:i] / b[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128 _mm_atan2_ps (__m128 a, __m128 b)
Synopsis
__m128 _mm_atan2_ps (__m128 a, __m128 b)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in a divided by packed elements in b, and store the results in dst expressed in radians.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := ATAN(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256 _mm256_atan2_ps (__m256 a, __m256 b)
Synopsis
__m256 _mm256_atan2_ps (__m256 a, __m256 b)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in a divided by packed elements in b, and store the results in dst expressed in radians.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := ATAN(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:256] := 0
...
__m512 _mm512_atan2_ps (__m512 a, __m512 b)
Synopsis
__m512 _mm512_atan2_ps (__m512 a, __m512 b)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in a divided by packed elements in b, and store the results in dst expressed in radians.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := ATAN(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:512] := 0
...
__m512 _mm512_mask_atan2_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
Synopsis
__m512 _mm512_mask_atan2_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in a divided by packed elements in b, and store the results in dst expressed in radians using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := ATAN(a[i+31:i] / b[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128d _mm_atanh_pd (__m128d a)
Synopsis
__m128d _mm_atanh_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the inverse hyperbolic tangent of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := ATANH(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
...
__m256d _mm256_atanh_pd (__m256d a)
Synopsis
__m256d _mm256_atanh_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the inverse hyperbolic tangent of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := ATANH(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
...
__m512d _mm512_atanh_pd (__m512d a)
Synopsis
__m512d _mm512_atanh_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the inverse hyperbolic tangent of packed double-precision (64-bit) floating-point elements in a, and store the results in dst expressed in radians.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := ATANH(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
...
__m512d _mm512_mask_atanh_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_atanh_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the inverse hyperbolic tangent of packed double-precision (64-bit) floating-point elements in a, and store the results in dst expressed in radians using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := ATANH(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128 _mm_atanh_ps (__m128 a)
Synopsis
__m128 _mm_atanh_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the inverse hyperbolic tangent of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := ATANH(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256 _mm256_atanh_ps (__m256 a)
Synopsis
__m256 _mm256_atanh_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the inverse hyperbolic tangent of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := ATANH(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
...
__m512 _mm512_atanh_ps (__m512 a)
Synopsis
__m512 _mm512_atanh_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the inverse hyperbolic tangent of packed single-precision (32-bit) floating-point elements in a, and store the results in dst expressed in radians.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := ATANH(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
...
__m512 _mm512_mask_atanh_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_atanh_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the inverse hyperbolic tangent of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := ATANH(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
pavgw
__m128i _mm_avg_epu16 (__m128i a, __m128i b)
Synopsis
__m128i _mm_avg_epu16 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: pavgw xmm, xmm
CPUID Flags: SSE2
Description
Average packed unsigned 16-bit integers in a and b, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*16
dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
ENDFOR
Performance
vpavgw
__m128i _mm_mask_avg_epu16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_avg_epu16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpavgw
CPUID Flags: AVX512VL + AVX512BW
Description
Average packed unsigned 16-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:128] := 0
vpavgw
__m128i _mm_maskz_avg_epu16 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_avg_epu16 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpavgw
CPUID Flags: AVX512VL + AVX512BW
Description
Average packed unsigned 16-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpavgw
__m256i _mm256_avg_epu16 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_avg_epu16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpavgw ymm, ymm, ymm
CPUID Flags: AVX2
Description
Average packed unsigned 16-bit integers in a and b, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*16
dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
ENDFOR
dst[MAX:256] := 0
Performance
vpavgw
__m256i _mm256_mask_avg_epu16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_avg_epu16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpavgw
CPUID Flags: AVX512VL + AVX512BW
Description
Average packed unsigned 16-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
vpavgw
__m256i _mm256_maskz_avg_epu16 (__mmask16 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_avg_epu16 (__mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpavgw
CPUID Flags: AVX512VL + AVX512BW
Description
Average packed unsigned 16-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpavgw
__m512i _mm512_avg_epu16 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_avg_epu16 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpavgw
CPUID Flags: AVX512BW
Description
Average packed unsigned 16-bit integers in a and b, and store the results in dst.
Operation
FOR j := 0 to 31
i := j*16
dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
ENDFOR
dst[MAX:512] := 0
vpavgw
__m512i _mm512_mask_avg_epu16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_avg_epu16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpavgw
CPUID Flags: AVX512BW
Description
Average packed unsigned 16-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:512] := 0
vpavgw
__m512i _mm512_maskz_avg_epu16 (__mmask32 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_avg_epu16 (__mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpavgw
CPUID Flags: AVX512BW
Description
Average packed unsigned 16-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
pavgb
__m128i _mm_avg_epu8 (__m128i a, __m128i b)
Synopsis
__m128i _mm_avg_epu8 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: pavgb xmm, xmm
CPUID Flags: SSE2
Description
Average packed unsigned 8-bit integers in a and b, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*8
dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
ENDFOR
Performance
vpavgb
__m128i _mm_mask_avg_epu8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_avg_epu8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpavgb
CPUID Flags: AVX512VL + AVX512BW
Description
Average packed unsigned 8-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k[j]
dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:128] := 0
vpavgb
__m128i _mm_maskz_avg_epu8 (__mmask16 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_avg_epu8 (__mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpavgb
CPUID Flags: AVX512VL + AVX512BW
Description
Average packed unsigned 8-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k[j]
dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpavgb
__m256i _mm256_avg_epu8 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_avg_epu8 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpavgb ymm, ymm, ymm
CPUID Flags: AVX2
Description
Average packed unsigned 8-bit integers in a and b, and store the results in dst.
Operation
FOR j := 0 to 31
i := j*8
dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
ENDFOR
dst[MAX:256] := 0
Performance
vpavgb
__m256i _mm256_mask_avg_epu8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_avg_epu8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpavgb
CPUID Flags: AVX512VL + AVX512BW
Description
Average packed unsigned 8-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k[j]
dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:256] := 0
vpavgb
__m256i _mm256_maskz_avg_epu8 (__mmask32 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_avg_epu8 (__mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpavgb
CPUID Flags: AVX512VL + AVX512BW
Description
Average packed unsigned 8-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k[j]
dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpavgb
__m512i _mm512_avg_epu8 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_avg_epu8 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpavgb
CPUID Flags: AVX512BW
Description
Average packed unsigned 8-bit integers in a and b, and store the results in dst.
Operation
FOR j := 0 to 63
i := j*8
dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
ENDFOR
dst[MAX:512] := 0
vpavgb
__m512i _mm512_mask_avg_epu8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_avg_epu8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpavgb
CPUID Flags: AVX512BW
Description
Average packed unsigned 8-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k[j]
dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:512] := 0
vpavgb
__m512i _mm512_maskz_avg_epu8 (__mmask64 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_avg_epu8 (__mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpavgb
CPUID Flags: AVX512BW
Description
Average packed unsigned 8-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k[j]
dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
pavgw
__m64 _mm_avg_pu16 (__m64 a, __m64 b)
Synopsis
__m64 _mm_avg_pu16 (__m64 a, __m64 b)
#include "xmmintrin.h"
Instruction: pavgw mm, mm
CPUID Flags: SSE
Description
Average packed unsigned 16-bit integers in a and b, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*16
dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
ENDFOR
pavgb
__m64 _mm_avg_pu8 (__m64 a, __m64 b)
Synopsis
__m64 _mm_avg_pu8 (__m64 a, __m64 b)
#include "xmmintrin.h"
Instruction: pavgb mm, mm
CPUID Flags: SSE
Description
Average packed unsigned 8-bit integers in a and b, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*8
dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
ENDFOR
bextr
unsigned int _bextr_u32 (unsigned int a, unsigned int start, unsigned int len)
Synopsis
unsigned int _bextr_u32 (unsigned int a, unsigned int start, unsigned int len)
#include "immintrin.h"
Instruction: bextr r32, r32, r32
CPUID Flags: BMI1
Description
Extract contiguous bits from unsigned 32-bit integer a, and store the result in dst. Extract the number of bits specified by len, starting at the bit specified by start.
Operation
tmp := ZERO_EXTEND_TO_512(a)
dst := ZERO_EXTEND(tmp[start+len-1:start])
Performance
bextr
unsigned __int64 _bextr_u64 (unsigned __int64 a, unsigned int start, unsigned int len)
Synopsis
unsigned __int64 _bextr_u64 (unsigned __int64 a, unsigned int start, unsigned int len)
#include "immintrin.h"
Instruction: bextr r64, r64, r64
CPUID Flags: BMI1
Description
Extract contiguous bits from unsigned 64-bit integer a, and store the result in dst. Extract the number of bits specified by len, starting at the bit specified by start.
Operation
tmp := ZERO_EXTEND_TO_512(a)
dst := ZERO_EXTEND(tmp[start+len-1:start])
Performance
bsf
int _bit_scan_forward (int a)
Synopsis
int _bit_scan_forward (int a)
#include "immintrin.h"
Instruction: bsf r32, r32
Description
Set dst to the index of the lowest set bit in 32-bit integer a. If no bits are set in a then dst is undefined.
Operation
tmp := 0
IF a = 0
dst := undefined
ELSE
DO WHILE ((tmp < 32) AND a[tmp] = 0)
tmp := tmp + 1
dst := tmp
OD
FI
Performance
bsr
int _bit_scan_reverse (int a)
Synopsis
int _bit_scan_reverse (int a)
#include "immintrin.h"
Instruction: bsr r32, r32
Description
Set dst to the index of the highest set bit in 32-bit integer a. If no bits are set in a then dst is undefined.
Operation
tmp := 31
IF a = 0
dst := undefined
ELSE
DO WHILE ((tmp > 0) AND a[tmp] = 0)
tmp := tmp - 1
dst := tmp
OD
FI
Performance
bsf
unsigned char _BitScanForward (unsigned __int32* index, unsigned __int32 mask)
Synopsis
unsigned char _BitScanForward (unsigned __int32* index, unsigned __int32 mask)
#include "immintrin.h"
Instruction: bsf r32, r32
Description
Set index to the index of the lowest set bit in 32-bit integer mask. If no bits are set in mask, then set dst to 0, otherwise set dst to 1.
Operation
tmp := 0
IF mask = 0
dst := 0
ELSE
DO WHILE ((tmp < 32) AND mask[tmp] = 0)
tmp := tmp + 1
index := tmp
dst := 1
OD
FI
Performance
bsf
unsigned char _BitScanForward64 (unsigned __int32* index, unsigned __int64 mask)
Synopsis
unsigned char _BitScanForward64 (unsigned __int32* index, unsigned __int64 mask)
#include "immintrin.h"
Instruction: bsf r64, r64
Description
Set index to the index of the lowest set bit in 64-bit integer mask. If no bits are set in mask, then set dst to 0, otherwise set dst to 1.
Operation
tmp := 0
IF mask = 0
dst := 0
ELSE
DO WHILE ((tmp < 64) AND mask[tmp] = 0)
tmp := tmp + 1
index := tmp
dst := 1
OD
FI
Performance
bsr
unsigned char _BitScanReverse (unsigned __int32* index, unsigned __int32 mask)
Synopsis
unsigned char _BitScanReverse (unsigned __int32* index, unsigned __int32 mask)
#include "immintrin.h"
Instruction: bsr r32, r32
Description
Set index to the index of the highest set bit in 32-bit integer mask. If no bits are set in mask, then set dst to 0, otherwise set dst to 1.
Operation
tmp := 31
IF mask = 0
dst := 0
ELSE
DO WHILE ((tmp > 0) AND mask[tmp] = 0)
tmp := tmp - 1
index := tmp
dst := 1
OD
FI
Performance
bsr
unsigned char _BitScanReverse64 (unsigned __int32* index, unsigned __int64 mask)
Synopsis
unsigned char _BitScanReverse64 (unsigned __int32* index, unsigned __int64 mask)
#include "immintrin.h"
Instruction: bsr r64, r64
Description
Set index to the index of the highest set bit in 64-bit integer mask. If no bits are set in mask, then set dst to 0, otherwise set dst to 1.
Operation
tmp := 63
IF mask = 0
dst := 0
ELSE
DO WHILE ((tmp > 0) AND mask[tmp] = 0)
tmp := tmp - 1
index := tmp
dst := 1
OD
FI
Performance
bt
unsigned char _bittest (__int32* a, __int32 b)
Synopsis
unsigned char _bittest (__int32* a, __int32 b)
#include "immintrin.h"
Instruction: bt r32, r32
Description
Return the bit at index b of 32-bit integer a.
Operation
dst := a[b]
bt
unsigned char _bittest64 (__int64* a, __int64 b)
Synopsis
unsigned char _bittest64 (__int64* a, __int64 b)
#include "immintrin.h"
Instruction: bt r64, r64
Description
Return the bit at index b of 64-bit integer a.
Operation
dst := a[b]
btc
unsigned char _bittestandcomplement (__int32* a, __int32 b)
Synopsis
unsigned char _bittestandcomplement (__int32* a, __int32 b)
#include "immintrin.h"
Instruction: btc r32, r32
Description
Return the bit at index b of 32-bit integer a, and set that bit to its complement.
Operation
dst := a[b]
a[b] := ~a[b]
btc
unsigned char _bittestandcomplement64 (__int64* a, __int64 b)
Synopsis
unsigned char _bittestandcomplement64 (__int64* a, __int64 b)
#include "immintrin.h"
Instruction: btc r64, r64
Description
Return the bit at index b of 64-bit integer a, and set that bit to its complement.
Operation
dst := a[b]
a[b] := ~a[b]
btr
unsigned char _bittestandreset (__int32* a, __int32 b)
Synopsis
unsigned char _bittestandreset (__int32* a, __int32 b)
#include "immintrin.h"
Instruction: btr r32, r32
Description
Return the bit at index b of 32-bit integer a, and set that bit to zero.
Operation
dst := a[b]
a[b] := 0
btr
unsigned char _bittestandreset64 (__int64* a, __int64 b)
Synopsis
unsigned char _bittestandreset64 (__int64* a, __int64 b)
#include "immintrin.h"
Instruction: btr r64, r64
Description
Return the bit at index b of 64-bit integer a, and set that bit to zero.
Operation
dst := a[b]
a[b] := 0
bts
unsigned char _bittestandset (__int32* a, __int32 b)
Synopsis
unsigned char _bittestandset (__int32* a, __int32 b)
#include "immintrin.h"
Instruction: bts r32, r32
Description
Return the bit at index b of 32-bit integer a, and set that bit to one.
Operation
dst := a[b]
a[b] := 1
bts
unsigned char _bittestandset64 (__int64* a, __int64 b)
Synopsis
unsigned char _bittestandset64 (__int64* a, __int64 b)
#include "immintrin.h"
Instruction: bts r64, r64
Description
Return the bit at index b of 64-bit integer a, and set that bit to one.
Operation
dst := a[b]
a[b] := 1
pblendw
__m128i _mm_blend_epi16 (__m128i a, __m128i b, const int imm8)
Synopsis
__m128i _mm_blend_epi16 (__m128i a, __m128i b, const int imm8)
#include "smmintrin.h"
Instruction: pblendw xmm, xmm, imm
CPUID Flags: SSE4.1
Description
Blend packed 16-bit integers from a and b using control mask imm8, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*16
IF imm8[j%8]
dst[i+15:i] := b[i+15:i]
ELSE
dst[i+15:i] := a[i+15:i]
FI
ENDFOR
Performance
vpblendmw
__m128i _mm_mask_blend_epi16 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_blend_epi16 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpblendmw
CPUID Flags: AVX512VL + AVX512BW
Description
Blend packed 16-bit integers from a and b using control mask k, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := b[i+15:i]
ELSE
dst[i+15:i] := a[i+15:i]
FI
ENDFOR
dst[MAX:128] := 0
vpblendw
__m256i _mm256_blend_epi16 (__m256i a, __m256i b, const int imm8)
Synopsis
__m256i _mm256_blend_epi16 (__m256i a, __m256i b, const int imm8)
#include "immintrin.h"
Instruction: vpblendw ymm, ymm, ymm, imm
CPUID Flags: AVX2
Description
Blend packed 16-bit integers from a and b using control mask imm8, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*16
IF imm8[j%8]
dst[i+15:i] := b[i+15:i]
ELSE
dst[i+15:i] := a[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
Performance
vpblendmw
__m256i _mm256_mask_blend_epi16 (__mmask16 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_blend_epi16 (__mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpblendmw
CPUID Flags: AVX512VL + AVX512BW
Description
Blend packed 16-bit integers from a and b using control mask k, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := b[i+15:i]
ELSE
dst[i+15:i] := a[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
vpblendmw
__m512i _mm512_mask_blend_epi16 (__mmask32 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_blend_epi16 (__mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpblendmw
CPUID Flags: AVX512BW
Description
Blend packed 16-bit integers from a and b using control mask k, and store the results in dst.
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := b[i+15:i]
ELSE
dst[i+15:i] := a[i+15:i]
FI
ENDFOR
dst[MAX:512] := 0
vpblendd
__m128i _mm_blend_epi32 (__m128i a, __m128i b, const int imm8)
Synopsis
__m128i _mm_blend_epi32 (__m128i a, __m128i b, const int imm8)
#include "immintrin.h"
Instruction: vpblendd xmm, xmm, xmm, imm
CPUID Flags: AVX2
Description
Blend packed 32-bit integers from a and b using control mask imm8, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
IF imm8[j%8]
dst[i+31:i] := b[i+31:i]
ELSE
dst[i+31:i] := a[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
Performance
vpblendmd
__m128i _mm_mask_blend_epi32 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_blend_epi32 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpblendmd
CPUID Flags: AVX512VL + AVX512F
Description
Blend packed 32-bit integers from a and b using control mask k, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := b[i+31:i]
ELSE
dst[i+31:i] := a[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vpblendd
__m256i _mm256_blend_epi32 (__m256i a, __m256i b, const int imm8)
Synopsis
__m256i _mm256_blend_epi32 (__m256i a, __m256i b, const int imm8)
#include "immintrin.h"
Instruction: vpblendd ymm, ymm, ymm, imm
CPUID Flags: AVX2
Description
Blend packed 32-bit integers from a and b using control mask imm8, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
IF imm8[j%8]
dst[i+31:i] := b[i+31:i]
ELSE
dst[i+31:i] := a[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
Performance
vpblendmd
__m256i _mm256_mask_blend_epi32 (__mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_blend_epi32 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpblendmd
CPUID Flags: AVX512VL + AVX512F
Description
Blend packed 32-bit integers from a and b using control mask k, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := b[i+31:i]
ELSE
dst[i+31:i] := a[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vpblendmd
__m512i _mm512_mask_blend_epi32 (__mmask16 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_blend_epi32 (__mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpblendmd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Blend packed 32-bit integers from a and b using control mask k, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := b[i+31:i]
ELSE
dst[i+31:i] := a[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpblendmq
__m128i _mm_mask_blend_epi64 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_blend_epi64 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpblendmq
CPUID Flags: AVX512VL + AVX512F
Description
Blend packed 64-bit integers from a and b using control mask k, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := b[i+63:i]
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpblendmq
__m256i _mm256_mask_blend_epi64 (__mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_blend_epi64 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpblendmq
CPUID Flags: AVX512VL + AVX512F
Description
Blend packed 64-bit integers from a and b using control mask k, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := b[i+63:i]
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpblendmq
__m512i _mm512_mask_blend_epi64 (__mmask8 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_blend_epi64 (__mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpblendmq zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Blend packed 64-bit integers from a and b using control mask k, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := b[i+63:i]
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpblendmb
__m128i _mm_mask_blend_epi8 (__mmask16 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_blend_epi8 (__mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpblendmb
CPUID Flags: AVX512VL + AVX512BW
Description
Blend packed 8-bit integers from a and b using control mask k, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*8
IF k[j]
dst[i+7:i] := b[i+7:i]
ELSE
dst[i+7:i] := a[i+7:i]
FI
ENDFOR
dst[MAX:128] := 0
vpblendmb
__m256i _mm256_mask_blend_epi8 (__mmask32 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_blend_epi8 (__mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpblendmb
CPUID Flags: AVX512VL + AVX512BW
Description
Blend packed 8-bit integers from a and b using control mask k, and store the results in dst.
Operation
FOR j := 0 to 31
i := j*8
IF k[j]
dst[i+7:i] := b[i+7:i]
ELSE
dst[i+7:i] := a[i+7:i]
FI
ENDFOR
dst[MAX:256] := 0
vpblendmb
__m512i _mm512_mask_blend_epi8 (__mmask64 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_blend_epi8 (__mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpblendmb
CPUID Flags: AVX512BW
Description
Blend packed 8-bit integers from a and b using control mask k, and store the results in dst.
Operation
FOR j := 0 to 63
i := j*8
IF k[j]
dst[i+7:i] := b[i+7:i]
ELSE
dst[i+7:i] := a[i+7:i]
FI
ENDFOR
dst[MAX:512] := 0
blendpd
__m128d _mm_blend_pd (__m128d a, __m128d b, const int imm8)
Synopsis
__m128d _mm_blend_pd (__m128d a, __m128d b, const int imm8)
#include "smmintrin.h"
Instruction: blendpd xmm, xmm, imm
CPUID Flags: SSE4.1
Description
Blend packed double-precision (64-bit) floating-point elements from a and b using control mask imm8, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
IF imm8[j%8]
dst[i+63:i] := b[i+63:i]
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
Performance
vblendmpd
__m128d _mm_mask_blend_pd (__mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_mask_blend_pd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vblendmpd
CPUID Flags: AVX512VL + AVX512F
Description
Blend packed double-precision (64-bit) floating-point elements from a and b using control mask k, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := b[i+63:i]
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vblendpd
__m256d _mm256_blend_pd (__m256d a, __m256d b, const int imm8)
Synopsis
__m256d _mm256_blend_pd (__m256d a, __m256d b, const int imm8)
#include "immintrin.h"
Instruction: vblendpd ymm, ymm, ymm, imm
CPUID Flags: AVX
Description
Blend packed double-precision (64-bit) floating-point elements from a and b using control mask imm8, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
IF imm8[j%8]
dst[i+63:i] := b[i+63:i]
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
Performance
vblendmpd
__m256d _mm256_mask_blend_pd (__mmask8 k, __m256d a, __m256d b)
Synopsis
__m256d _mm256_mask_blend_pd (__mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vblendmpd
CPUID Flags: AVX512VL + AVX512F
Description
Blend packed double-precision (64-bit) floating-point elements from a and b using control mask k, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := b[i+63:i]
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vblendmpd
__m512d _mm512_mask_blend_pd (__mmask8 k, __m512d a, __m512d b)
Synopsis
__m512d _mm512_mask_blend_pd (__mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vblendmpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Blend packed double-precision (64-bit) floating-point elements from a and b using control mask k, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := b[i+63:i]
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
blendps
__m128 _mm_blend_ps (__m128 a, __m128 b, const int imm8)
Synopsis
__m128 _mm_blend_ps (__m128 a, __m128 b, const int imm8)
#include "smmintrin.h"
Instruction: blendps xmm, xmm, imm
CPUID Flags: SSE4.1
Description
Blend packed single-precision (32-bit) floating-point elements from a and b using control mask imm8, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
IF imm8[j%8]
dst[i+31:i] := b[i+31:i]
ELSE
dst[i+31:i] := a[i+31:i]
FI
ENDFOR
Performance
vblendmps
__m128 _mm_mask_blend_ps (__mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_mask_blend_ps (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vblendmps
CPUID Flags: AVX512VL + AVX512F
Description
Blend packed single-precision (32-bit) floating-point elements from a and b using control mask k, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := b[i+31:i]
ELSE
dst[i+31:i] := a[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vblendps
__m256 _mm256_blend_ps (__m256 a, __m256 b, const int imm8)
Synopsis
__m256 _mm256_blend_ps (__m256 a, __m256 b, const int imm8)
#include "immintrin.h"
Instruction: vblendps ymm, ymm, ymm, imm
CPUID Flags: AVX
Description
Blend packed single-precision (32-bit) floating-point elements from a and b using control mask imm8, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
IF imm8[j%8]
dst[i+31:i] := b[i+31:i]
ELSE
dst[i+31:i] := a[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
Performance
vblendmps
__m256 _mm256_mask_blend_ps (__mmask8 k, __m256 a, __m256 b)
Synopsis
__m256 _mm256_mask_blend_ps (__mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vblendmps
CPUID Flags: AVX512VL + AVX512F
Description
Blend packed single-precision (32-bit) floating-point elements from a and b using control mask k, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := b[i+31:i]
ELSE
dst[i+31:i] := a[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vblendmps
__m512 _mm512_mask_blend_ps (__mmask16 k, __m512 a, __m512 b)
Synopsis
__m512 _mm512_mask_blend_ps (__mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vblendmps zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Blend packed single-precision (32-bit) floating-point elements from a and b using control mask k, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := b[i+31:i]
ELSE
dst[i+31:i] := a[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
pblendvb
__m128i _mm_blendv_epi8 (__m128i a, __m128i b, __m128i mask)
Synopsis
__m128i _mm_blendv_epi8 (__m128i a, __m128i b, __m128i mask)
#include "smmintrin.h"
Instruction: pblendvb xmm, xmm
CPUID Flags: SSE4.1
Description
Blend packed 8-bit integers from a and b using mask, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*8
IF mask[i+7]
dst[i+7:i] := b[i+7:i]
ELSE
dst[i+7:i] := a[i+7:i]
FI
ENDFOR
Performance
vpblendvb
__m256i _mm256_blendv_epi8 (__m256i a, __m256i b, __m256i mask)
Synopsis
__m256i _mm256_blendv_epi8 (__m256i a, __m256i b, __m256i mask)
#include "immintrin.h"
Instruction: vpblendvb ymm, ymm, ymm, ymm
CPUID Flags: AVX2
Description
Blend packed 8-bit integers from a and b using mask, and store the results in dst.
Operation
FOR j := 0 to 31
i := j*8
IF mask[i+7]
dst[i+7:i] := b[i+7:i]
ELSE
dst[i+7:i] := a[i+7:i]
FI
ENDFOR
dst[MAX:256] := 0
Performance
blendvpd
__m128d _mm_blendv_pd (__m128d a, __m128d b, __m128d mask)
Synopsis
__m128d _mm_blendv_pd (__m128d a, __m128d b, __m128d mask)
#include "smmintrin.h"
Instruction: blendvpd xmm, xmm
CPUID Flags: SSE4.1
Description
Blend packed double-precision (64-bit) floating-point elements from a and b using mask, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
IF mask[i+63]
dst[i+63:i] := b[i+63:i]
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
Performance
vblendvpd
__m256d _mm256_blendv_pd (__m256d a, __m256d b, __m256d mask)
Synopsis
__m256d _mm256_blendv_pd (__m256d a, __m256d b, __m256d mask)
#include "immintrin.h"
Instruction: vblendvpd ymm, ymm, ymm, ymm
CPUID Flags: AVX
Description
Blend packed double-precision (64-bit) floating-point elements from a and b using mask, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
IF mask[i+63]
dst[i+63:i] := b[i+63:i]
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
Performance
blendvps
__m128 _mm_blendv_ps (__m128 a, __m128 b, __m128 mask)
Synopsis
__m128 _mm_blendv_ps (__m128 a, __m128 b, __m128 mask)
#include "smmintrin.h"
Instruction: blendvps xmm, xmm
CPUID Flags: SSE4.1
Description
Blend packed single-precision (32-bit) floating-point elements from a and b using mask, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
IF mask[i+31]
dst[i+31:i] := b[i+31:i]
ELSE
dst[i+31:i] := a[i+31:i]
FI
ENDFOR
Performance
vblendvps
__m256 _mm256_blendv_ps (__m256 a, __m256 b, __m256 mask)
Synopsis
__m256 _mm256_blendv_ps (__m256 a, __m256 b, __m256 mask)
#include "immintrin.h"
Instruction: vblendvps ymm, ymm, ymm, ymm
CPUID Flags: AVX
Description
Blend packed single-precision (32-bit) floating-point elements from a and b using mask, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
IF mask[i+31]
dst[i+31:i] := b[i+31:i]
ELSE
dst[i+31:i] := a[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
Performance
blsi
unsigned int _blsi_u32 (unsigned int a)
Synopsis
unsigned int _blsi_u32 (unsigned int a)
#include "immintrin.h"
Instruction: blsi r32, r32
CPUID Flags: BMI1
Description
Extract the lowest set bit from unsigned 32-bit integer a and set the corresponding bit in dst. All other bits in dst are zeroed, and all bits are zeroed if no bits are set in a.
Operation
dst := (-a) BITWISE AND a
Performance
blsi
unsigned __int64 _blsi_u64 (unsigned __int64 a)
Synopsis
unsigned __int64 _blsi_u64 (unsigned __int64 a)
#include "immintrin.h"
Instruction: blsi r64, r64
CPUID Flags: BMI1
Description
Extract the lowest set bit from unsigned 64-bit integer a and set the corresponding bit in dst. All other bits in dst are zeroed, and all bits are zeroed if no bits are set in a.
Operation
dst := (-a) BITWISE AND a
Performance
blsmsk
unsigned int _blsmsk_u32 (unsigned int a)
Synopsis
unsigned int _blsmsk_u32 (unsigned int a)
#include "immintrin.h"
Instruction: blsmsk r32, r32
CPUID Flags: BMI1
Description
Set all the lower bits of dst up to and including the lowest set bit in unsigned 32-bit integer a.
Operation
dst := (a - 1) XOR a
Performance
blsmsk
unsigned __int64 _blsmsk_u64 (unsigned __int64 a)
Synopsis
unsigned __int64 _blsmsk_u64 (unsigned __int64 a)
#include "immintrin.h"
Instruction: blsmsk r64, r64
CPUID Flags: BMI1
Description
Set all the lower bits of dst up to and including the lowest set bit in unsigned 64-bit integer a.
Operation
dst := (a - 1) XOR a
Performance
blsr
unsigned int _blsr_u32 (unsigned int a)
Synopsis
unsigned int _blsr_u32 (unsigned int a)
#include "immintrin.h"
Instruction: blsr r32, r32
CPUID Flags: BMI1
Description
Copy all bits from unsigned 32-bit integer a to dst, and reset (set to 0) the bit in dst that corresponds to the lowest set bit in a.
Operation
dst := (a - 1) BITWISE AND a
Performance
blsr
unsigned __int64 _blsr_u64 (unsigned __int64 a)
Synopsis
unsigned __int64 _blsr_u64 (unsigned __int64 a)
#include "immintrin.h"
Instruction: blsr r64, r64
CPUID Flags: BMI1
Description
Copy all bits from unsigned 64-bit integer a to dst, and reset (set to 0) the bit in dst that corresponds to the lowest set bit in a.
Operation
dst := (a - 1) BITWISE AND a
Performance
bndcu, bndcn
void _bnd_chk_ptr_bounds (const void * q, size_t size)
Synopsis
void _bnd_chk_ptr_bounds (const void * q, size_t size)
#include "immintrin.h"
Instruction: bndcu bnd, m32
bndcn bnd, m32
CPUID Flags: MPX
Description
Checks if [q, q + size - 1] is within the lower and upper bounds of q and throws a #BR if not.
Operation
IF (q + size - 1) < q.LB OR (q + size - 1) > q.UB THEN
#BR;
FI;
bndcl
void _bnd_chk_ptr_lbounds (const void * q)
Synopsis
void _bnd_chk_ptr_lbounds (const void * q)
#include "immintrin.h"
Instruction: bndcl bnd, m32
CPUID Flags: MPX
Description
Checks if q is within its lower bound, and throws a #BR if not.
Operation
IF q < q.LB THEN
#BR;
FI;
bndcu, bndcn
void _bnd_chk_ptr_ubounds (const void * q)
Synopsis
void _bnd_chk_ptr_ubounds (const void * q)
#include "immintrin.h"
Instruction: bndcu bnd, m32
bndcn bnd, m32
CPUID Flags: MPX
Description
Checks if q is within its upper bound, and throws a #BR if not.
Operation
IF q > q.UB THEN
#BR;
FI;
...
void * _bnd_copy_ptr_bounds (const void * q, const void * r)
Synopsis
void * _bnd_copy_ptr_bounds (const void * q, const void * r)
#include "immintrin.h"
CPUID Flags: MPX
Description
Make a pointer with the value of q and bounds set to the bounds of r (e.g. copy the bounds of r to pointer q), and store the result in dst.
Operation
dst := q;
dst.LB := r.LB;
dst.UB := r.UB;
...
const void * _bnd_get_ptr_lbound (const void * q)
Synopsis
const void * _bnd_get_ptr_lbound (const void * q)
#include "immintrin.h"
CPUID Flags: MPX
Description
Return the lower bound of q.
Operation
dst := q.LB
...
const void * _bnd_get_ptr_ubound (const void * q)
Synopsis
const void * _bnd_get_ptr_ubound (const void * q)
#include "immintrin.h"
CPUID Flags: MPX
Description
Return the upper bound of q.
Operation
dst := q.UB
...
void * _bnd_init_ptr_bounds (const void * q)
Synopsis
void * _bnd_init_ptr_bounds (const void * q)
#include "immintrin.h"
CPUID Flags: MPX
Description
Make a pointer with the value of q and open bounds, which allow the pointer to access the entire virtual address space, and store the result in dst.
Operation
dst := q;
dst.LB := 0;
dst.UB := 0;
...
void * _bnd_narrow_ptr_bounds (const void * q, const void * r, size_t size)
Synopsis
void * _bnd_narrow_ptr_bounds (const void * q, const void * r, size_t size)
#include "immintrin.h"
CPUID Flags: MPX
Description
Narrow the bounds for pointer q to the intersection of the bounds of r and the bounds [q, q + size - 1], and store the result in dst.
Operation
dst := q;
IF r.LB > (q + size - 1) OR r.UB < q THEN
dst.LB := 1;
dst.UB := 0;
ELSE
dst.LB := MAX(r.LB, q);
dst.UB := MIN(r.UB, (q + size - 1));
FI;
bndmk
void * _bnd_set_ptr_bounds (const void * srcmem, size_t size)
Synopsis
void * _bnd_set_ptr_bounds (const void * srcmem, size_t size)
#include "immintrin.h"
Instruction: bndmk bnd, m32
CPUID Flags: MPX
Description
Make a pointer with the value of srcmem and bounds set to [srcmem, srcmem + size - 1], and store the result in dst.
Operation
dst := srcmem;
dst.LB := srcmem;
dst.UB := srcmem + size - 1;
bndstx
void _bnd_store_ptr_bounds (const void ** ptr_addr, const void * ptr_val)
Synopsis
void _bnd_store_ptr_bounds (const void ** ptr_addr, const void * ptr_val)
#include "immintrin.h"
Instruction: bndstx mib, bnd
CPUID Flags: MPX
Description
Stores the bounds of ptr_val pointer in memory at address ptr_addr.
Operation
MEM[ptr_addr].LB := ptr_val.LB;
MEM[ptr_addr].UB := ptr_val.UB;
vbroadcastf32x2
__m256 _mm256_broadcast_f32x2 (__m128 a)
Synopsis
__m256 _mm256_broadcast_f32x2 (__m128 a)
#include "immintrin.h"
Instruction: vbroadcastf32x2
CPUID Flags: AVX512VL + AVX512DQ
Description
Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from a to all elements of dst.
Operation
FOR j := 0 to 7
i := j*32
n := (j mod 2)*32
dst[i+31:i] := a[n+31:n]
ENDFOR
dst[MAX:256] := 0
vbroadcastf32x2
__m256 _mm256_mask_broadcast_f32x2 (__m256 src, __mmask8 k, __m128 a)
Synopsis
__m256 _mm256_mask_broadcast_f32x2 (__m256 src, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vbroadcastf32x2
CPUID Flags: AVX512VL + AVX512DQ
Description
Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
n := (j mod 2)*32
IF k[j]
dst[i+31:i] := a[n+31:n]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vbroadcastf32x2
__m256 _mm256_maskz_broadcast_f32x2 (__mmask8 k, __m128 a)
Synopsis
__m256 _mm256_maskz_broadcast_f32x2 (__mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vbroadcastf32x2
CPUID Flags: AVX512VL + AVX512DQ
Description
Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
n := (j mod 2)*32
IF k[j]
dst[i+31:i] := a[n+31:n]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vbroadcastf32x2
__m512 _mm512_broadcast_f32x2 (__m128 a)
Synopsis
__m512 _mm512_broadcast_f32x2 (__m128 a)
#include "immintrin.h"
Instruction: vbroadcastf32x2
CPUID Flags: AVX512DQ
Description
Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from a to all elements of dst.
Operation
FOR j := 0 to 15
i := j*32
n := (j mod 2)*32
dst[i+31:i] := a[n+31:n]
ENDFOR
dst[MAX:512] := 0
vbroadcastf32x2
__m512 _mm512_mask_broadcast_f32x2 (__m512 src, __mmask16 k, __m128 a)
Synopsis
__m512 _mm512_mask_broadcast_f32x2 (__m512 src, __mmask16 k, __m128 a)
#include "immintrin.h"
Instruction: vbroadcastf32x2
CPUID Flags: AVX512DQ
Description
Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
n := (j mod 2)*32
IF k[j]
dst[i+31:i] := a[n+31:n]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vbroadcastf32x2
__m512 _mm512_maskz_broadcast_f32x2 (__mmask16 k, __m128 a)
Synopsis
__m512 _mm512_maskz_broadcast_f32x2 (__mmask16 k, __m128 a)
#include "immintrin.h"
Instruction: vbroadcastf32x2
CPUID Flags: AVX512DQ
Description
Broadcast the lower 2 packed single-precision (32-bit) floating-point elements from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
n := (j mod 2)*32
IF k[j]
dst[i+31:i] := a[n+31:n]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vbroadcastf32x4
__m256 _mm256_broadcast_f32x4 (__m128 a)
Synopsis
__m256 _mm256_broadcast_f32x4 (__m128 a)
#include "immintrin.h"
Instruction: vbroadcastf32x4
CPUID Flags: AVX512VL + AVX512F
Description
Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst.
Operation
FOR j := 0 to 7
i := j*32
n := (j mod 4)*32
dst[i+31:i] := a[n+31:n]
ENDFOR
dst[MAX:256] := 0
vbroadcastf32x4
__m256 _mm256_mask_broadcast_f32x4 (__m256 src, __mmask8 k, __m128 a)
Synopsis
__m256 _mm256_mask_broadcast_f32x4 (__m256 src, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vbroadcastf32x4
CPUID Flags: AVX512VL + AVX512F
Description
Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
n := (j mod 4)*32
IF k[j]
dst[i+31:i] := a[n+31:n]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vbroadcastf32x4
__m256 _mm256_maskz_broadcast_f32x4 (__mmask8 k, __m128 a)
Synopsis
__m256 _mm256_maskz_broadcast_f32x4 (__mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vbroadcastf32x4
CPUID Flags: AVX512VL + AVX512F
Description
Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
n := (j mod 4)*32
IF k[j]
dst[i+31:i] := a[n+31:n]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vbroadcastf32x4
__m512 _mm512_broadcast_f32x4 (__m128 a)
Synopsis
__m512 _mm512_broadcast_f32x4 (__m128 a)
#include "immintrin.h"
Instruction: vbroadcastf32x4 zmm {k}, m128
CPUID Flags: AVX512F
Description
Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst.
Operation
FOR j := 0 to 15
i := j*32
n := (j mod 4)*32
dst[i+31:i] := a[n+31:n]
ENDFOR
dst[MAX:512] := 0
vbroadcastf32x4
__m512 _mm512_mask_broadcast_f32x4 (__m512 src, __mmask16 k, __m128 a)
Synopsis
__m512 _mm512_mask_broadcast_f32x4 (__m512 src, __mmask16 k, __m128 a)
#include "immintrin.h"
Instruction: vbroadcastf32x4 zmm {k}, m128
CPUID Flags: AVX512F
Description
Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
n := (j mod 4)*32
IF k[j]
dst[i+31:i] := a[n+31:n]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vbroadcastf32x4
__m512 _mm512_maskz_broadcast_f32x4 (__mmask16 k, __m128 a)
Synopsis
__m512 _mm512_maskz_broadcast_f32x4 (__mmask16 k, __m128 a)
#include "immintrin.h"
Instruction: vbroadcastf32x4 zmm {k}, m128
CPUID Flags: AVX512F
Description
Broadcast the 4 packed single-precision (32-bit) floating-point elements from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
n := (j mod 4)*32
IF k[j]
dst[i+31:i] := a[n+31:n]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vbroadcastf32x8
__m512 _mm512_broadcast_f32x8 (__m256 a)
Synopsis
__m512 _mm512_broadcast_f32x8 (__m256 a)
#include "immintrin.h"
Instruction: vbroadcastf32x8
CPUID Flags: AVX512DQ
Description
Broadcast the 8 packed single-precision (32-bit) floating-point elements from a to all elements of dst.
Operation
FOR j := 0 to 15
i := j*32
n := (j mod 8)*32
dst[i+31:i] := a[n+31:n]
ENDFOR
dst[MAX:512] := 0
vbroadcastf32x8
__m512 _mm512_mask_broadcast_f32x8 (__m512 src, __mmask16 k, __m256 a)
Synopsis
__m512 _mm512_mask_broadcast_f32x8 (__m512 src, __mmask16 k, __m256 a)
#include "immintrin.h"
Instruction: vbroadcastf32x8
CPUID Flags: AVX512DQ
Description
Broadcast the 8 packed single-precision (32-bit) floating-point elements from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
n := (j mod 8)*32
IF k[j]
dst[i+31:i] := a[n+31:n]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vbroadcastf32x8
__m512 _mm512_maskz_broadcast_f32x8 (__mmask16 k, __m256 a)
Synopsis
__m512 _mm512_maskz_broadcast_f32x8 (__mmask16 k, __m256 a)
#include "immintrin.h"
Instruction: vbroadcastf32x8
CPUID Flags: AVX512DQ
Description
Broadcast the 8 packed single-precision (32-bit) floating-point elements from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
n := (j mod 8)*32
IF k[j]
dst[i+31:i] := a[n+31:n]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vbroadcastf64x2
__m256d _mm256_broadcast_f64x2 (__m128d a)
Synopsis
__m256d _mm256_broadcast_f64x2 (__m128d a)
#include "immintrin.h"
Instruction: vbroadcastf64x2
CPUID Flags: AVX512VL + AVX512DQ
Description
Broadcast the 2 packed double-precision (64-bit) floating-point elements from a to all elements of dst.
Operation
FOR j := 0 to 3
i := j*64
n := (j mod 2)*64
dst[i+63:i] := a[n+63:n]
ENDFOR
dst[MAX:256] := 0
vbroadcastf64x2
__m256d _mm256_mask_broadcast_f64x2 (__m256d src, __mmask8 k, __m128d a)
Synopsis
__m256d _mm256_mask_broadcast_f64x2 (__m256d src, __mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vbroadcastf64x2
CPUID Flags: AVX512VL + AVX512DQ
Description
Broadcast the 2 packed double-precision (64-bit) floating-point elements from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
n := (j mod 2)*64
IF k[j]
dst[i+63:i] := a[n+63:n]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vbroadcastf64x2
__m256d _mm256_maskz_broadcast_f64x2 (__mmask8 k, __m128d a)
Synopsis
__m256d _mm256_maskz_broadcast_f64x2 (__mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vbroadcastf64x2
CPUID Flags: AVX512VL + AVX512DQ
Description
Broadcast the 2 packed double-precision (64-bit) floating-point elements from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
n := (j mod 2)*64
IF k[j]
dst[i+63:i] := a[n+63:n]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vbroadcastf64x2
__m512d _mm512_broadcast_f64x2 (__m128d a)
Synopsis
__m512d _mm512_broadcast_f64x2 (__m128d a)
#include "immintrin.h"
Instruction: vbroadcastf64x2
CPUID Flags: AVX512DQ
Description
Broadcast the 2 packed double-precision (64-bit) floating-point elements from a to all elements of dst.
Operation
FOR j := 0 to 7
i := j*64
n := (j mod 2)*64
dst[i+63:i] := a[n+63:n]
ENDFOR
dst[MAX:512] := 0
vbroadcastf64x2
__m512d _mm512_mask_broadcast_f64x2 (__m512d src, __mmask8 k, __m128d a)
Synopsis
__m512d _mm512_mask_broadcast_f64x2 (__m512d src, __mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vbroadcastf64x2
CPUID Flags: AVX512DQ
Description
Broadcast the 2 packed double-precision (64-bit) floating-point elements from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
n := (j mod 2)*64
IF k[j]
dst[i+63:i] := a[n+63:n]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vbroadcastf64x2
__m512d _mm512_maskz_broadcast_f64x2 (__mmask8 k, __m128d a)
Synopsis
__m512d _mm512_maskz_broadcast_f64x2 (__mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vbroadcastf64x2
CPUID Flags: AVX512DQ
Description
Broadcast the 2 packed double-precision (64-bit) floating-point elements from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
n := (j mod 2)*64
IF k[j]
dst[i+63:i] := a[n+63:n]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vbroadcastf64x4
__m512d _mm512_broadcast_f64x4 (__m256d a)
Synopsis
__m512d _mm512_broadcast_f64x4 (__m256d a)
#include "immintrin.h"
Instruction: vbroadcastf64x4 zmm {k}, m256
CPUID Flags: AVX512F
Description
Broadcast the 4 packed double-precision (64-bit) floating-point elements from a to all elements of dst.
Operation
FOR j := 0 to 7
i := j*64
n := (j mod 4)*64
dst[i+63:i] := a[n+63:n]
ENDFOR
dst[MAX:512] := 0
vbroadcastf64x4
__m512d _mm512_mask_broadcast_f64x4 (__m512d src, __mmask8 k, __m256d a)
Synopsis
__m512d _mm512_mask_broadcast_f64x4 (__m512d src, __mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vbroadcastf64x4 zmm {k}, m256
CPUID Flags: AVX512F
Description
Broadcast the 4 packed double-precision (64-bit) floating-point elements from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
n := (j mod 4)*64
IF k[j]
dst[i+63:i] := a[n+63:n]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vbroadcastf64x4
__m512d _mm512_maskz_broadcast_f64x4 (__mmask8 k, __m256d a)
Synopsis
__m512d _mm512_maskz_broadcast_f64x4 (__mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vbroadcastf64x4 zmm {k}, m256
CPUID Flags: AVX512F
Description
Broadcast the 4 packed double-precision (64-bit) floating-point elements from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
n := (j mod 4)*64
IF k[j]
dst[i+63:i] := a[n+63:n]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vbroadcasti32x2
__m128i _mm_broadcast_i32x2 (__m128i a)
Synopsis
__m128i _mm_broadcast_i32x2 (__m128i a)
#include "immintrin.h"
Instruction: vbroadcasti32x2
CPUID Flags: AVX512VL + AVX512DQ
Description
Broadcast the lower 2 packed 32-bit integers from a to all elements of dst.
Operation
FOR j := 0 to 3
i := j*32
n := (j mod 2)*32
dst[i+31:i] := a[n+31:n]
ENDFOR
dst[MAX:128] := 0
vbroadcasti32x2
__m128i _mm_mask_broadcast_i32x2 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_broadcast_i32x2 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vbroadcasti32x2
CPUID Flags: AVX512VL + AVX512DQ
Description
Broadcast the lower 2 packed 32-bit integers from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
n := (j mod 2)*32
IF k[j]
dst[i+31:i] := a[n+31:n]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vbroadcasti32x2
__m128i _mm_maskz_broadcast_i32x2 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_broadcast_i32x2 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vbroadcasti32x2
CPUID Flags: AVX512VL + AVX512DQ
Description
Broadcast the lower 2 packed 32-bit integers from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
n := (j mod 2)*32
IF k[j]
dst[i+31:i] := a[n+31:n]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vbroadcasti32x2
__m256i _mm256_broadcast_i32x2 (__m128i a)
Synopsis
__m256i _mm256_broadcast_i32x2 (__m128i a)
#include "immintrin.h"
Instruction: vbroadcasti32x2
CPUID Flags: AVX512VL + AVX512DQ
Description
Broadcast the lower 2 packed 32-bit integers from a to all elements of dst.
Operation
FOR j := 0 to 7
i := j*32
n := (j mod 2)*32
dst[i+31:i] := a[n+31:n]
ENDFOR
dst[MAX:256] := 0
vbroadcasti32x2
__m256i _mm256_mask_broadcast_i32x2 (__m256i src, __mmask8 k, __m128i a)
Synopsis
__m256i _mm256_mask_broadcast_i32x2 (__m256i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vbroadcasti32x2
CPUID Flags: AVX512VL + AVX512DQ
Description
Broadcast the lower 2 packed 32-bit integers from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
n := (j mod 2)*32
IF k[j]
dst[i+31:i] := a[n+31:n]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vbroadcasti32x2
__m256i _mm256_maskz_broadcast_i32x2 (__mmask8 k, __m128i a)
Synopsis
__m256i _mm256_maskz_broadcast_i32x2 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vbroadcasti32x2
CPUID Flags: AVX512VL + AVX512DQ
Description
Broadcast the lower 2 packed 32-bit integers from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
n := (j mod 2)*32
IF k[j]
dst[i+31:i] := a[n+31:n]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vbroadcasti32x2
__m512i _mm512_broadcast_i32x2 (__m128i a)
Synopsis
__m512i _mm512_broadcast_i32x2 (__m128i a)
#include "immintrin.h"
Instruction: vbroadcasti32x2
CPUID Flags: AVX512DQ
Description
Broadcast the lower 2 packed 32-bit integers from a to all elements of dst.
Operation
FOR j := 0 to 15
i := j*32
n := (j mod 2)*32
dst[i+31:i] := a[n+31:n]
ENDFOR
dst[MAX:512] := 0
vbroadcasti32x2
__m512i _mm512_mask_broadcast_i32x2 (__m512i src, __mmask16 k, __m128i a)
Synopsis
__m512i _mm512_mask_broadcast_i32x2 (__m512i src, __mmask16 k, __m128i a)
#include "immintrin.h"
Instruction: vbroadcasti32x2
CPUID Flags: AVX512DQ
Description
Broadcast the lower 2 packed 32-bit integers from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
n := (j mod 2)*32
IF k[j]
dst[i+31:i] := a[n+31:n]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vbroadcasti32x2
__m512i _mm512_maskz_broadcast_i32x2 (__mmask16 k, __m128i a)
Synopsis
__m512i _mm512_maskz_broadcast_i32x2 (__mmask16 k, __m128i a)
#include "immintrin.h"
Instruction: vbroadcasti32x2
CPUID Flags: AVX512DQ
Description
Broadcast the lower 2 packed 32-bit integers from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
n := (j mod 2)*32
IF k[j]
dst[i+31:i] := a[n+31:n]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vbroadcasti32x4
__m256i _mm256_broadcast_i32x4 (__m128i a)
Synopsis
__m256i _mm256_broadcast_i32x4 (__m128i a)
#include "immintrin.h"
Instruction: vbroadcasti32x4
CPUID Flags: AVX512VL + AVX512F
Description
Broadcast the 4 packed 32-bit integers from a to all elements of dst.
Operation
FOR j := 0 to 7
i := j*32
n := (j mod 4)*32
dst[i+31:i] := a[n+31:n]
ENDFOR
dst[MAX:256] := 0
vbroadcasti32x4
__m256i _mm256_mask_broadcast_i32x4 (__m256i src, __mmask8 k, __m128i a)
Synopsis
__m256i _mm256_mask_broadcast_i32x4 (__m256i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vbroadcasti32x4
CPUID Flags: AVX512VL + AVX512F
Description
Broadcast the 4 packed 32-bit integers from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
n := (j mod 4)*32
IF k[j]
dst[i+31:i] := a[n+31:n]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vbroadcasti32x4
__m256i _mm256_maskz_broadcast_i32x4 (__mmask8 k, __m128i a)
Synopsis
__m256i _mm256_maskz_broadcast_i32x4 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vbroadcasti32x4
CPUID Flags: AVX512VL + AVX512F
Description
Broadcast the 4 packed 32-bit integers from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
n := (j mod 4)*32
IF k[j]
dst[i+31:i] := a[n+31:n]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vbroadcasti32x4
__m512i _mm512_broadcast_i32x4 (__m128i a)
Synopsis
__m512i _mm512_broadcast_i32x4 (__m128i a)
#include "immintrin.h"
Instruction: vbroadcasti32x4 zmm {k}, m128
CPUID Flags: AVX512F
Description
Broadcast the 4 packed 32-bit integers from a to all elements of dst.
Operation
FOR j := 0 to 15
i := j*32
n := (j mod 4)*32
dst[i+31:i] := a[n+31:n]
ENDFOR
dst[MAX:512] := 0
vbroadcasti32x4
__m512i _mm512_mask_broadcast_i32x4 (__m512i src, __mmask16 k, __m128i a)
Synopsis
__m512i _mm512_mask_broadcast_i32x4 (__m512i src, __mmask16 k, __m128i a)
#include "immintrin.h"
Instruction: vbroadcasti32x4 zmm {k}, m128
CPUID Flags: AVX512F
Description
Broadcast the 4 packed 32-bit integers from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
n := (j mod 4)*32
IF k[j]
dst[i+31:i] := a[n+31:n]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vbroadcasti32x4
__m512i _mm512_maskz_broadcast_i32x4 (__mmask16 k, __m128i a)
Synopsis
__m512i _mm512_maskz_broadcast_i32x4 (__mmask16 k, __m128i a)
#include "immintrin.h"
Instruction: vbroadcasti32x4 zmm {k}, m128
CPUID Flags: AVX512F
Description
Broadcast the 4 packed 32-bit integers from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
n := (j mod 4)*32
IF k[j]
dst[i+31:i] := a[n+31:n]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vbroadcasti32x8
__m512i _mm512_broadcast_i32x8 (__m256i a)
Synopsis
__m512i _mm512_broadcast_i32x8 (__m256i a)
#include "immintrin.h"
Instruction: vbroadcasti32x8
CPUID Flags: AVX512DQ
Description
Broadcast the 8 packed 32-bit integers from a to all elements of dst.
Operation
FOR j := 0 to 15
i := j*32
n := (j mod 8)*32
dst[i+31:i] := a[n+31:n]
ENDFOR
dst[MAX:512] := 0
vbroadcasti32x8
__m512i _mm512_mask_broadcast_i32x8 (__m512i src, __mmask16 k, __m256i a)
Synopsis
__m512i _mm512_mask_broadcast_i32x8 (__m512i src, __mmask16 k, __m256i a)
#include "immintrin.h"
Instruction: vbroadcasti32x8
CPUID Flags: AVX512DQ
Description
Broadcast the 8 packed 32-bit integers from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
n := (j mod 8)*32
IF k[j]
dst[i+31:i] := a[n+31:n]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vbroadcasti32x8
__m512i _mm512_maskz_broadcast_i32x8 (__mmask16 k, __m256i a)
Synopsis
__m512i _mm512_maskz_broadcast_i32x8 (__mmask16 k, __m256i a)
#include "immintrin.h"
Instruction: vbroadcasti32x8
CPUID Flags: AVX512DQ
Description
Broadcast the 8 packed 32-bit integers from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
n := (j mod 8)*32
IF k[j]
dst[i+31:i] := a[n+31:n]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vbroadcasti64x2
__m256i _mm256_broadcast_i64x2 (__m128i a)
Synopsis
__m256i _mm256_broadcast_i64x2 (__m128i a)
#include "immintrin.h"
Instruction: vbroadcasti64x2
CPUID Flags: AVX512VL + AVX512DQ
Description
Broadcast the 2 packed 64-bit integers from a to all elements of dst.
Operation
FOR j := 0 to 3
i := j*64
n := (j mod 2)*64
dst[i+63:i] := a[n+63:n]
ENDFOR
dst[MAX:256] := 0
vbroadcasti64x2
__m256i _mm256_mask_broadcast_i64x2 (__m256i src, __mmask8 k, __m128i a)
Synopsis
__m256i _mm256_mask_broadcast_i64x2 (__m256i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vbroadcasti64x2
CPUID Flags: AVX512VL + AVX512DQ
Description
Broadcast the 2 packed 64-bit integers from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
n := (j mod 2)*64
IF k[j]
dst[i+63:i] := a[n+63:n]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vbroadcasti64x2
__m256i _mm256_maskz_broadcast_i64x2 (__mmask8 k, __m128i a)
Synopsis
__m256i _mm256_maskz_broadcast_i64x2 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vbroadcasti64x2
CPUID Flags: AVX512VL + AVX512DQ
Description
Broadcast the 2 packed 64-bit integers from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
n := (j mod 2)*64
IF k[j]
dst[i+63:i] := a[n+63:n]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vbroadcasti64x2
__m512i _mm512_broadcast_i64x2 (__m128i a)
Synopsis
__m512i _mm512_broadcast_i64x2 (__m128i a)
#include "immintrin.h"
Instruction: vbroadcasti64x2
CPUID Flags: AVX512DQ
Description
Broadcast the 2 packed 64-bit integers from a to all elements of dst.
Operation
FOR j := 0 to 7
i := j*64
n := (j mod 2)*64
dst[i+63:i] := a[n+63:n]
ENDFOR
dst[MAX:512] := 0
vbroadcasti64x2
__m512i _mm512_mask_broadcast_i64x2 (__m512i src, __mmask8 k, __m128i a)
Synopsis
__m512i _mm512_mask_broadcast_i64x2 (__m512i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vbroadcasti64x2
CPUID Flags: AVX512DQ
Description
Broadcast the 2 packed 64-bit integers from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
n := (j mod 2)*64
IF k[j]
dst[i+63:i] := a[n+63:n]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vbroadcasti64x2
__m512i _mm512_maskz_broadcast_i64x2 (__mmask8 k, __m128i a)
Synopsis
__m512i _mm512_maskz_broadcast_i64x2 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vbroadcasti64x2
CPUID Flags: AVX512DQ
Description
Broadcast the 2 packed 64-bit integers from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
n := (j mod 2)*64
IF k[j]
dst[i+63:i] := a[n+63:n]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vbroadcasti64x4
__m512i _mm512_broadcast_i64x4 (__m256i a)
Synopsis
__m512i _mm512_broadcast_i64x4 (__m256i a)
#include "immintrin.h"
Instruction: vbroadcasti64x4 zmm {k}, m256
CPUID Flags: AVX512F
Description
Broadcast the 4 packed 64-bit integers from a to all elements of dst.
Operation
FOR j := 0 to 7
i := j*64
n := (j mod 4)*64
dst[i+63:i] := a[n+63:n]
ENDFOR
dst[MAX:512] := 0
vbroadcasti64x4
__m512i _mm512_mask_broadcast_i64x4 (__m512i src, __mmask8 k, __m256i a)
Synopsis
__m512i _mm512_mask_broadcast_i64x4 (__m512i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vbroadcasti64x4 zmm {k}, m256
CPUID Flags: AVX512F
Description
Broadcast the 4 packed 64-bit integers from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
n := (j mod 4)*64
IF k[j]
dst[i+63:i] := a[n+63:n]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vbroadcasti64x4
__m512i _mm512_maskz_broadcast_i64x4 (__mmask8 k, __m256i a)
Synopsis
__m512i _mm512_maskz_broadcast_i64x4 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vbroadcasti64x4 zmm {k}, m256
CPUID Flags: AVX512F
Description
Broadcast the 4 packed 64-bit integers from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
n := (j mod 4)*64
IF k[j]
dst[i+63:i] := a[n+63:n]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vbroadcastf128
__m256d _mm256_broadcast_pd (__m128d const * mem_addr)
Synopsis
__m256d _mm256_broadcast_pd (__m128d const * mem_addr)
#include "immintrin.h"
Instruction: vbroadcastf128 ymm, m128
CPUID Flags: AVX
Description
Broadcast 128 bits from memory (composed of 2 packed double-precision (64-bit) floating-point elements) to all elements of dst.
Operation
tmp[127:0] := MEM[mem_addr+127:mem_addr]
dst[127:0] := tmp[127:0]
dst[255:128] := tmp[127:0]
dst[MAX:256] := 0
Performance
vbroadcastf128
__m256 _mm256_broadcast_ps (__m128 const * mem_addr)
Synopsis
__m256 _mm256_broadcast_ps (__m128 const * mem_addr)
#include "immintrin.h"
Instruction: vbroadcastf128 ymm, m128
CPUID Flags: AVX
Description
Broadcast 128 bits from memory (composed of 4 packed single-precision (32-bit) floating-point elements) to all elements of dst.
Operation
tmp[127:0] := MEM[mem_addr+127:mem_addr]
dst[127:0] := tmp[127:0]
dst[255:128] := tmp[127:0]
dst[MAX:256] := 0
Performance
vbroadcastsd
__m256d _mm256_broadcast_sd (double const * mem_addr)
Synopsis
__m256d _mm256_broadcast_sd (double const * mem_addr)
#include "immintrin.h"
Instruction: vbroadcastsd ymm, m64
CPUID Flags: AVX
Description
Broadcast a double-precision (64-bit) floating-point element from memory to all elements of dst.
Operation
tmp[63:0] := MEM[mem_addr+63:mem_addr]
FOR j := 0 to 3
i := j*64
dst[i+63:i] := tmp[63:0]
ENDFOR
dst[MAX:256] := 0
Performance
vbroadcastss
__m128 _mm_broadcast_ss (float const * mem_addr)
Synopsis
__m128 _mm_broadcast_ss (float const * mem_addr)
#include "immintrin.h"
Instruction: vbroadcastss xmm, m32
CPUID Flags: AVX
Description
Broadcast a single-precision (32-bit) floating-point element from memory to all elements of dst.
Operation
tmp[31:0] := MEM[mem_addr+31:mem_addr]
FOR j := 0 to 3
i := j*32
dst[i+31:i] := tmp[31:0]
ENDFOR
dst[MAX:128] := 0
vbroadcastss
__m256 _mm256_broadcast_ss (float const * mem_addr)
Synopsis
__m256 _mm256_broadcast_ss (float const * mem_addr)
#include "immintrin.h"
Instruction: vbroadcastss ymm, m32
CPUID Flags: AVX
Description
Broadcast a single-precision (32-bit) floating-point element from memory to all elements of dst.
Operation
tmp[31:0] := MEM[mem_addr+31:mem_addr]
FOR j := 0 to 7
i := j*32
dst[i+31:i] := tmp[31:0]
ENDFOR
dst[MAX:256] := 0
Performance
vpbroadcastb
__m128i _mm_broadcastb_epi8 (__m128i a)
Synopsis
__m128i _mm_broadcastb_epi8 (__m128i a)
#include "immintrin.h"
Instruction: vpbroadcastb xmm, xmm
CPUID Flags: AVX2
Description
Broadcast the low packed 8-bit integer from a to all elements of dst.
Operation
FOR j := 0 to 15
i := j*8
dst[i+7:i] := a[7:0]
ENDFOR
dst[MAX:128] := 0
Performance
vpbroadcastb
__m128i _mm_mask_broadcastb_epi8 (__m128i src, __mmask16 k, __m128i a)
Synopsis
__m128i _mm_mask_broadcastb_epi8 (__m128i src, __mmask16 k, __m128i a)
#include "immintrin.h"
Instruction: vpbroadcastb
CPUID Flags: AVX512VL + AVX512BW
Description
Broadcast the low packed 8-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k[j]
dst[i+7:i] := a[7:0]
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:128] := 0
vpbroadcastb
__m128i _mm_maskz_broadcastb_epi8 (__mmask16 k, __m128i a)
Synopsis
__m128i _mm_maskz_broadcastb_epi8 (__mmask16 k, __m128i a)
#include "immintrin.h"
Instruction: vpbroadcastb
CPUID Flags: AVX512VL + AVX512BW
Description
Broadcast the low packed 8-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k[j]
dst[i+7:i] := a[7:0]
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpbroadcastb
__m256i _mm256_broadcastb_epi8 (__m128i a)
Synopsis
__m256i _mm256_broadcastb_epi8 (__m128i a)
#include "immintrin.h"
Instruction: vpbroadcastb ymm, xmm
CPUID Flags: AVX2
Description
Broadcast the low packed 8-bit integer from a to all elements of dst.
Operation
FOR j := 0 to 31
i := j*8
dst[i+7:i] := a[7:0]
ENDFOR
dst[MAX:256] := 0
Performance
vpbroadcastb
__m256i _mm256_mask_broadcastb_epi8 (__m256i src, __mmask32 k, __m128i a)
Synopsis
__m256i _mm256_mask_broadcastb_epi8 (__m256i src, __mmask32 k, __m128i a)
#include "immintrin.h"
Instruction: vpbroadcastb
CPUID Flags: AVX512VL + AVX512BW
Description
Broadcast the low packed 8-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k[j]
dst[i+7:i] := a[7:0]
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:256] := 0
vpbroadcastb
__m256i _mm256_maskz_broadcastb_epi8 (__mmask32 k, __m128i a)
Synopsis
__m256i _mm256_maskz_broadcastb_epi8 (__mmask32 k, __m128i a)
#include "immintrin.h"
Instruction: vpbroadcastb
CPUID Flags: AVX512VL + AVX512BW
Description
Broadcast the low packed 8-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k[j]
dst[i+7:i] := a[7:0]
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpbroadcastb
__m512i _mm512_broadcastb_epi8 (__m128i a)
Synopsis
__m512i _mm512_broadcastb_epi8 (__m128i a)
#include "immintrin.h"
Instruction: vpbroadcastb
CPUID Flags: AVX512BW
Description
Broadcast the low packed 8-bit integer from a to all elements of dst.
Operation
FOR j := 0 to 63
i := j*8
dst[i+7:i] := a[7:0]
ENDFOR
dst[MAX:512] := 0
vpbroadcastb
__m512i _mm512_mask_broadcastb_epi8 (__m512i src, __mmask64 k, __m128i a)
Synopsis
__m512i _mm512_mask_broadcastb_epi8 (__m512i src, __mmask64 k, __m128i a)
#include "immintrin.h"
Instruction: vpbroadcastb
CPUID Flags: AVX512BW
Description
Broadcast the low packed 8-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k[j]
dst[i+7:i] := a[7:0]
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:512] := 0
vpbroadcastb
__m512i _mm512_maskz_broadcastb_epi8 (__mmask64 k, __m128i a)
Synopsis
__m512i _mm512_maskz_broadcastb_epi8 (__mmask64 k, __m128i a)
#include "immintrin.h"
Instruction: vpbroadcastb
CPUID Flags: AVX512BW
Description
Broadcast the low packed 8-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k[j]
dst[i+7:i] := a[7:0]
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpbroadcastd
__m128i _mm_broadcastd_epi32 (__m128i a)
Synopsis
__m128i _mm_broadcastd_epi32 (__m128i a)
#include "immintrin.h"
Instruction: vpbroadcastd xmm, xmm
CPUID Flags: AVX2
Description
Broadcast the low packed 32-bit integer from a to all elements of dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := a[31:0]
ENDFOR
dst[MAX:128] := 0
Performance
vpbroadcastd
__m128i _mm_mask_broadcastd_epi32 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_broadcastd_epi32 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpbroadcastd
CPUID Flags: AVX512VL + AVX512F
Description
Broadcast the low packed 32-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := a[31:0]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vpbroadcastd
__m128i _mm_maskz_broadcastd_epi32 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_broadcastd_epi32 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpbroadcastd
CPUID Flags: AVX512VL + AVX512F
Description
Broadcast the low packed 32-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := a[31:0]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpbroadcastd
__m256i _mm256_broadcastd_epi32 (__m128i a)
Synopsis
__m256i _mm256_broadcastd_epi32 (__m128i a)
#include "immintrin.h"
Instruction: vpbroadcastd ymm, xmm
CPUID Flags: AVX2
Description
Broadcast the low packed 32-bit integer from a to all elements of dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := a[31:0]
ENDFOR
dst[MAX:256] := 0
Performance
vpbroadcastd
__m256i _mm256_mask_broadcastd_epi32 (__m256i src, __mmask8 k, __m128i a)
Synopsis
__m256i _mm256_mask_broadcastd_epi32 (__m256i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpbroadcastd
CPUID Flags: AVX512VL + AVX512F
Description
Broadcast the low packed 32-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := a[31:0]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vpbroadcastd
__m256i _mm256_maskz_broadcastd_epi32 (__mmask8 k, __m128i a)
Synopsis
__m256i _mm256_maskz_broadcastd_epi32 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpbroadcastd
CPUID Flags: AVX512VL + AVX512F
Description
Broadcast the low packed 32-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := a[31:0]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpbroadcastd
__m512i _mm512_broadcastd_epi32 (__m128i a)
Synopsis
__m512i _mm512_broadcastd_epi32 (__m128i a)
#include "immintrin.h"
Instruction: vpbroadcastd zmm {k}, xmm
CPUID Flags: AVX512F
Description
Broadcast the low packed 32-bit integer from a to all elements of dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := a[31:0]
ENDFOR
dst[MAX:512] := 0
vpbroadcastd
__m512i _mm512_mask_broadcastd_epi32 (__m512i src, __mmask16 k, __m128i a)
Synopsis
__m512i _mm512_mask_broadcastd_epi32 (__m512i src, __mmask16 k, __m128i a)
#include "immintrin.h"
Instruction: vpbroadcastd zmm {k}, xmm
CPUID Flags: AVX512F
Description
Broadcast the low packed 32-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := a[31:0]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpbroadcastd
__m512i _mm512_maskz_broadcastd_epi32 (__mmask16 k, __m128i a)
Synopsis
__m512i _mm512_maskz_broadcastd_epi32 (__mmask16 k, __m128i a)
#include "immintrin.h"
Instruction: vpbroadcastd zmm {k}, xmm
CPUID Flags: AVX512F
Description
Broadcast the low packed 32-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := a[31:0]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpbroadcastmb2q
__m128i _mm_broadcastmb_epi64 (__mmask8 k)
Synopsis
__m128i _mm_broadcastmb_epi64 (__mmask8 k)
#include "immintrin.h"
Instruction: vpbroadcastmb2q
CPUID Flags: AVX512VL + AVX512CD
Description
Broadcast the low 8 bits from input mask k to all 64-bit elements of dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := ZeroExtend(k[7:0])
ENDFOR
dst[MAX:128] := 0
vpbroadcastmb2q
__m256i _mm256_broadcastmb_epi64 (__mmask8 k)
Synopsis
__m256i _mm256_broadcastmb_epi64 (__mmask8 k)
#include "immintrin.h"
Instruction: vpbroadcastmb2q
CPUID Flags: AVX512VL + AVX512CD
Description
Broadcast the low 8 bits from input mask k to all 64-bit elements of dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := ZeroExtend(k[7:0])
ENDFOR
dst[MAX:256] := 0
vpbroadcastmb2q
__m512i _mm512_broadcastmb_epi64 (__mmask8 k)
Synopsis
__m512i _mm512_broadcastmb_epi64 (__mmask8 k)
#include "immintrin.h"
Instruction: vpbroadcastmb2q zmm, k
CPUID Flags: AVX512CD
Description
Broadcast the low 8 bits from input mask k to all 64-bit elements of dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := ZeroExtend(k[7:0])
ENDFOR
dst[MAX:512] := 0
vpbroadcastmw2d
__m128i _mm_broadcastmw_epi32 (__mmask16 k)
Synopsis
__m128i _mm_broadcastmw_epi32 (__mmask16 k)
#include "immintrin.h"
Instruction: vpbroadcastmw2d
CPUID Flags: AVX512VL + AVX512CD
Description
Broadcast the low 16 bits from input mask k to all 32-bit elements of dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := ZeroExtend(k[15:0])
ENDFOR
dst[MAX:128] := 0
vpbroadcastmw2d
__m256i _mm256_broadcastmw_epi32 (__mmask16 k)
Synopsis
__m256i _mm256_broadcastmw_epi32 (__mmask16 k)
#include "immintrin.h"
Instruction: vpbroadcastmw2d
CPUID Flags: AVX512VL + AVX512CD
Description
Broadcast the low 16 bits from input mask k to all 32-bit elements of dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := ZeroExtend(k[15:0])
ENDFOR
dst[MAX:256] := 0
vpbroadcastmw2d
__m512i _mm512_broadcastmw_epi32 (__mmask16 k)
Synopsis
__m512i _mm512_broadcastmw_epi32 (__mmask16 k)
#include "immintrin.h"
Instruction: vpbroadcastmw2d zmm, k
CPUID Flags: AVX512CD
Description
Broadcast the low 16 bits from input mask k to all 32-bit elements of dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := ZeroExtend(k[15:0])
ENDFOR
dst[MAX:512] := 0
vpbroadcastq
__m128i _mm_broadcastq_epi64 (__m128i a)
Synopsis
__m128i _mm_broadcastq_epi64 (__m128i a)
#include "immintrin.h"
Instruction: vpbroadcastq xmm, xmm
CPUID Flags: AVX2
Description
Broadcast the low packed 64-bit integer from a to all elements of dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := a[63:0]
ENDFOR
dst[MAX:128] := 0
Performance
vpbroadcastq
__m128i _mm_mask_broadcastq_epi64 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_broadcastq_epi64 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpbroadcastq
CPUID Flags: AVX512VL + AVX512F
Description
Broadcast the low packed 64-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := a[63:0]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpbroadcastq
__m128i _mm_maskz_broadcastq_epi64 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_broadcastq_epi64 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpbroadcastq
CPUID Flags: AVX512VL + AVX512F
Description
Broadcast the low packed 64-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := a[63:0]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpbroadcastq
__m256i _mm256_broadcastq_epi64 (__m128i a)
Synopsis
__m256i _mm256_broadcastq_epi64 (__m128i a)
#include "immintrin.h"
Instruction: vpbroadcastq ymm, xmm
CPUID Flags: AVX2
Description
Broadcast the low packed 64-bit integer from a to all elements of dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := a[63:0]
ENDFOR
dst[MAX:256] := 0
Performance
vpbroadcastq
__m256i _mm256_mask_broadcastq_epi64 (__m256i src, __mmask8 k, __m128i a)
Synopsis
__m256i _mm256_mask_broadcastq_epi64 (__m256i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpbroadcastq
CPUID Flags: AVX512VL + AVX512F
Description
Broadcast the low packed 64-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := a[63:0]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpbroadcastq
__m256i _mm256_maskz_broadcastq_epi64 (__mmask8 k, __m128i a)
Synopsis
__m256i _mm256_maskz_broadcastq_epi64 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpbroadcastq
CPUID Flags: AVX512VL + AVX512F
Description
Broadcast the low packed 64-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := a[63:0]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpbroadcastq
__m512i _mm512_broadcastq_epi64 (__m128i a)
Synopsis
__m512i _mm512_broadcastq_epi64 (__m128i a)
#include "immintrin.h"
Instruction: vpbroadcastq zmm {k}, xmm
CPUID Flags: AVX512F
Description
Broadcast the low packed 64-bit integer from a to all elements of dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := a[63:0]
ENDFOR
dst[MAX:512] := 0
vpbroadcastq
__m512i _mm512_mask_broadcastq_epi64 (__m512i src, __mmask8 k, __m128i a)
Synopsis
__m512i _mm512_mask_broadcastq_epi64 (__m512i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpbroadcastq zmm {k}, xmm
CPUID Flags: AVX512F
Description
Broadcast the low packed 64-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[63:0]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpbroadcastq
__m512i _mm512_maskz_broadcastq_epi64 (__mmask8 k, __m128i a)
Synopsis
__m512i _mm512_maskz_broadcastq_epi64 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpbroadcastq zmm {k}, xmm
CPUID Flags: AVX512F
Description
Broadcast the low packed 64-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[63:0]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
movddup
__m128d _mm_broadcastsd_pd (__m128d a)
Synopsis
__m128d _mm_broadcastsd_pd (__m128d a)
#include "immintrin.h"
Instruction: movddup xmm, xmm
CPUID Flags: AVX2
Description
Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := a[63:0]
ENDFOR
dst[MAX:128] := 0
Performance
vbroadcastsd
__m256d _mm256_broadcastsd_pd (__m128d a)
Synopsis
__m256d _mm256_broadcastsd_pd (__m128d a)
#include "immintrin.h"
Instruction: vbroadcastsd ymm, xmm
CPUID Flags: AVX2
Description
Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := a[63:0]
ENDFOR
dst[MAX:256] := 0
Performance
vbroadcastsd
__m256d _mm256_mask_broadcastsd_pd (__m256d src, __mmask8 k, __m128d a)
Synopsis
__m256d _mm256_mask_broadcastsd_pd (__m256d src, __mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vbroadcastsd
CPUID Flags: AVX512VL + AVX512F
Description
Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := a[63:0]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vbroadcastsd
__m256d _mm256_maskz_broadcastsd_pd (__mmask8 k, __m128d a)
Synopsis
__m256d _mm256_maskz_broadcastsd_pd (__mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vbroadcastsd
CPUID Flags: AVX512VL + AVX512F
Description
Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := a[63:0]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vbroadcastsd
__m512d _mm512_broadcastsd_pd (__m128d a)
Synopsis
__m512d _mm512_broadcastsd_pd (__m128d a)
#include "immintrin.h"
Instruction: vbroadcastsd zmm {k}, xmm
CPUID Flags: AVX512F
Description
Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := a[63:0]
ENDFOR
dst[MAX:512] := 0
vbroadcastsd
__m512d _mm512_mask_broadcastsd_pd (__m512d src, __mmask8 k, __m128d a)
Synopsis
__m512d _mm512_mask_broadcastsd_pd (__m512d src, __mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vbroadcastsd zmm {k}, xmm
CPUID Flags: AVX512F
Description
Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[63:0]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vbroadcastsd
__m512d _mm512_maskz_broadcastsd_pd (__mmask8 k, __m128d a)
Synopsis
__m512d _mm512_maskz_broadcastsd_pd (__mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vbroadcastsd zmm {k}, xmm
CPUID Flags: AVX512F
Description
Broadcast the low double-precision (64-bit) floating-point element from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[63:0]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vbroadcasti128
__m256i _mm256_broadcastsi128_si256 (__m128i a)
Synopsis
__m256i _mm256_broadcastsi128_si256 (__m128i a)
#include "immintrin.h"
Instruction: vbroadcasti128 ymm, m128
CPUID Flags: AVX2
Description
Broadcast 128 bits of integer data from a to all 128-bit lanes in dst.
Operation
dst[127:0] := a[127:0]
dst[255:128] := a[127:0]
dst[MAX:256] := 0
vbroadcastss
__m128 _mm_broadcastss_ps (__m128 a)
Synopsis
__m128 _mm_broadcastss_ps (__m128 a)
#include "immintrin.h"
Instruction: vbroadcastss xmm, xmm
CPUID Flags: AVX2
Description
Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := a[31:0]
ENDFOR
dst[MAX:128] := 0
Performance
vbroadcastss
__m128 _mm_mask_broadcastss_ps (__m128 src, __mmask8 k, __m128 a)
Synopsis
__m128 _mm_mask_broadcastss_ps (__m128 src, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vbroadcastss
CPUID Flags: AVX512VL + AVX512F
Description
Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := a[31:0]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vbroadcastss
__m128 _mm_maskz_broadcastss_ps (__mmask8 k, __m128 a)
Synopsis
__m128 _mm_maskz_broadcastss_ps (__mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vbroadcastss
CPUID Flags: AVX512VL + AVX512F
Description
Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := a[31:0]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vbroadcastss
__m256 _mm256_broadcastss_ps (__m128 a)
Synopsis
__m256 _mm256_broadcastss_ps (__m128 a)
#include "immintrin.h"
Instruction: vbroadcastss ymm, xmm
CPUID Flags: AVX2
Description
Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := a[31:0]
ENDFOR
dst[MAX:256] := 0
Performance
vbroadcastss
__m256 _mm256_mask_broadcastss_ps (__m256 src, __mmask8 k, __m128 a)
Synopsis
__m256 _mm256_mask_broadcastss_ps (__m256 src, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vbroadcastss
CPUID Flags: AVX512VL + AVX512F
Description
Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := a[31:0]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vbroadcastss
__m256 _mm256_maskz_broadcastss_ps (__mmask8 k, __m128 a)
Synopsis
__m256 _mm256_maskz_broadcastss_ps (__mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vbroadcastss
CPUID Flags: AVX512VL + AVX512F
Description
Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := a[31:0]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vbroadcastss
__m512 _mm512_broadcastss_ps (__m128 a)
Synopsis
__m512 _mm512_broadcastss_ps (__m128 a)
#include "immintrin.h"
Instruction: vbroadcastss zmm {k}, xmm
CPUID Flags: AVX512F
Description
Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := a[31:0]
ENDFOR
dst[MAX:512] := 0
vbroadcastss
__m512 _mm512_mask_broadcastss_ps (__m512 src, __mmask16 k, __m128 a)
Synopsis
__m512 _mm512_mask_broadcastss_ps (__m512 src, __mmask16 k, __m128 a)
#include "immintrin.h"
Instruction: vbroadcastss zmm {k}, xmm
CPUID Flags: AVX512F
Description
Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := a[31:0]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vbroadcastss
__m512 _mm512_maskz_broadcastss_ps (__mmask16 k, __m128 a)
Synopsis
__m512 _mm512_maskz_broadcastss_ps (__mmask16 k, __m128 a)
#include "immintrin.h"
Instruction: vbroadcastss zmm {k}, xmm
CPUID Flags: AVX512F
Description
Broadcast the low single-precision (32-bit) floating-point element from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := a[31:0]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpbroadcastw
__m128i _mm_broadcastw_epi16 (__m128i a)
Synopsis
__m128i _mm_broadcastw_epi16 (__m128i a)
#include "immintrin.h"
Instruction: vpbroadcastw xmm, xmm
CPUID Flags: AVX2
Description
Broadcast the low packed 16-bit integer from a to all elements of dst.
Operation
FOR j := 0 to 7
i := j*16
dst[i+15:i] := a[15:0]
ENDFOR
dst[MAX:128] := 0
Performance
vpbroadcastw
__m128i _mm_mask_broadcastw_epi16 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_broadcastw_epi16 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpbroadcastw
CPUID Flags: AVX512VL + AVX512BW
Description
Broadcast the low packed 16-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := a[15:0]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:128] := 0
vpbroadcastw
__m128i _mm_maskz_broadcastw_epi16 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_broadcastw_epi16 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpbroadcastw
CPUID Flags: AVX512VL + AVX512BW
Description
Broadcast the low packed 16-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := a[15:0]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpbroadcastw
__m256i _mm256_broadcastw_epi16 (__m128i a)
Synopsis
__m256i _mm256_broadcastw_epi16 (__m128i a)
#include "immintrin.h"
Instruction: vpbroadcastw ymm, xmm
CPUID Flags: AVX2
Description
Broadcast the low packed 16-bit integer from a to all elements of dst.
Operation
FOR j := 0 to 15
i := j*16
dst[i+15:i] := a[15:0]
ENDFOR
dst[MAX:256] := 0
Performance
vpbroadcastw
__m256i _mm256_mask_broadcastw_epi16 (__m256i src, __mmask16 k, __m128i a)
Synopsis
__m256i _mm256_mask_broadcastw_epi16 (__m256i src, __mmask16 k, __m128i a)
#include "immintrin.h"
Instruction: vpbroadcastw
CPUID Flags: AVX512VL + AVX512BW
Description
Broadcast the low packed 16-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := a[15:0]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
vpbroadcastw
__m256i _mm256_maskz_broadcastw_epi16 (__mmask16 k, __m128i a)
Synopsis
__m256i _mm256_maskz_broadcastw_epi16 (__mmask16 k, __m128i a)
#include "immintrin.h"
Instruction: vpbroadcastw
CPUID Flags: AVX512VL + AVX512BW
Description
Broadcast the low packed 16-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := a[15:0]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpbroadcastw
__m512i _mm512_broadcastw_epi16 (__m128i a)
Synopsis
__m512i _mm512_broadcastw_epi16 (__m128i a)
#include "immintrin.h"
Instruction: vpbroadcastw
CPUID Flags: AVX512BW
Description
Broadcast the low packed 16-bit integer from a to all elements of dst.
Operation
FOR j := 0 to 31
i := j*16
dst[i+15:i] := a[15:0]
ENDFOR
dst[MAX:512] := 0
vpbroadcastw
__m512i _mm512_mask_broadcastw_epi16 (__m512i src, __mmask32 k, __m128i a)
Synopsis
__m512i _mm512_mask_broadcastw_epi16 (__m512i src, __mmask32 k, __m128i a)
#include "immintrin.h"
Instruction: vpbroadcastw
CPUID Flags: AVX512BW
Description
Broadcast the low packed 16-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := a[15:0]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:512] := 0
vpbroadcastw
__m512i _mm512_maskz_broadcastw_epi16 (__mmask32 k, __m128i a)
Synopsis
__m512i _mm512_maskz_broadcastw_epi16 (__mmask32 k, __m128i a)
#include "immintrin.h"
Instruction: vpbroadcastw
CPUID Flags: AVX512BW
Description
Broadcast the low packed 16-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := a[15:0]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpslldq
__m256i _mm256_bslli_epi128 (__m256i a, const int imm8)
Synopsis
__m256i _mm256_bslli_epi128 (__m256i a, const int imm8)
#include "immintrin.h"
Instruction: vpslldq ymm, ymm, imm
CPUID Flags: AVX2
Description
Shift 128-bit lanes in a left by imm8 bytes while shifting in zeros, and store the results in dst.
Operation
tmp := imm8[7:0]
IF tmp > 15
tmp := 16
FI
dst[127:0] := a[127:0] << (tmp*8)
dst[255:128] := a[255:128] << (tmp*8)
dst[MAX:256] := 0
Performance
vpslldq
__m512i _mm512_bslli_epi128 (__m512i a, int imm8)
Synopsis
__m512i _mm512_bslli_epi128 (__m512i a, int imm8)
#include "immintrin.h"
Instruction: vpslldq
CPUID Flags: AVX512BW
Description
Shift 128-bit lanes in a left by imm8 bytes while shifting in zeros, and store the results in dst.
Operation
tmp := imm8[7:0]
IF tmp > 15
tmp := 16
FI
dst[127:0] := a[127:0] << (tmp*8)
dst[255:128] := a[255:128] << (tmp*8)
dst[383:256] := a[383:256] << (tmp*8)
dst[511:384] := a[511:384] << (tmp*8)
dst[MAX:512] := 0
pslldq
__m128i _mm_bslli_si128 (__m128i a, int imm8)
Synopsis
__m128i _mm_bslli_si128 (__m128i a, int imm8)
#include "emmintrin.h"
Instruction: pslldq xmm, imm
CPUID Flags: SSE2
Description
Shift a left by imm8 bytes while shifting in zeros, and store the results in dst.
Operation
tmp := imm8[7:0]
IF tmp > 15
tmp := 16
FI
dst[127:0] := a[127:0] << (tmp*8)
Performance
vpsrldq
__m256i _mm256_bsrli_epi128 (__m256i a, const int imm8)
Synopsis
__m256i _mm256_bsrli_epi128 (__m256i a, const int imm8)
#include "immintrin.h"
Instruction: vpsrldq ymm, ymm, imm
CPUID Flags: AVX2
Description
Shift 128-bit lanes in a right by imm8 bytes while shifting in zeros, and store the results in dst.
Operation
tmp := imm8[7:0]
IF tmp > 15
tmp := 16
FI
dst[127:0] := a[127:0] >> (tmp*8)
dst[255:128] := a[255:128] >> (tmp*8)
dst[MAX:256] := 0
Performance
vpsrldq
__m512i _mm512_bsrli_epi128 (__m512i a, int imm8)
Synopsis
__m512i _mm512_bsrli_epi128 (__m512i a, int imm8)
#include "immintrin.h"
Instruction: vpsrldq
CPUID Flags: AVX512BW
Description
Shift 128-bit lanes in a right by imm8 bytes while shifting in zeros, and store the results in dst.
Operation
tmp := imm8[7:0]
IF tmp > 15
tmp := 16
FI
dst[127:0] := a[127:0] >> (tmp*8)
dst[255:128] := a[255:128] >> (tmp*8)
dst[383:256] := a[383:256] >> (tmp*8)
dst[511:384] := a[511:384] >> (tmp*8)
dst[MAX:512] := 0
psrldq
__m128i _mm_bsrli_si128 (__m128i a, int imm8)
Synopsis
__m128i _mm_bsrli_si128 (__m128i a, int imm8)
#include "emmintrin.h"
Instruction: psrldq xmm, imm
CPUID Flags: SSE2
Description
Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
Operation
tmp := imm8[7:0]
IF tmp > 15
tmp := 16
FI
dst[127:0] := a[127:0] >> (tmp*8)
Performance
bswap
int _bswap (int a)
Synopsis
int _bswap (int a)
#include "immintrin.h"
Instruction: bswap r32
Description
Reverse the byte order of 32-bit integer a, and store the result in dst. This intrinsic is provided for conversion between little and big endian values.
Operation
dst[7:0] := a[31:24]
dst[15:8] := a[23:16]
dst[23:16] := a[15:8]
dst[31:24] := a[7:0]
Performance
bswap
__int64 _bswap64 (__int64 a)
Synopsis
__int64 _bswap64 (__int64 a)
#include "immintrin.h"
Instruction: bswap r64
Description
Reverse the byte order of 64-bit integer a, and store the result in dst. This intrinsic is provided for conversion between little and big endian values.
Operation
dst[7:0] := a[63:56]
dst[15:8] := a[55:48]
dst[23:16] := a[47:40]
dst[31:24] := a[39:32]
dst[39:32] := a[31:24]
dst[47:40] := a[23:16]
dst[55:48] := a[15:8]
dst[63:56] := a[7:0]
Performance
bzhi
unsigned int _bzhi_u32 (unsigned int a, unsigned int index)
Synopsis
unsigned int _bzhi_u32 (unsigned int a, unsigned int index)
#include "immintrin.h"
Instruction: bzhi r32, r32, r32
CPUID Flags: BMI2
Description
Copy all bits from unsigned 32-bit integer a to dst, and reset (set to 0) the high bits in dst starting at index.
Operation
n := index[7:0]
dst := a
IF (n < 32)
dst[31:n] := 0
FI
Performance
bzhi
unsigned __int64 _bzhi_u64 (unsigned __int64 a, unsigned int index)
Synopsis
unsigned __int64 _bzhi_u64 (unsigned __int64 a, unsigned int index)
#include "immintrin.h"
Instruction: bzhi r64, r64, r64
CPUID Flags: BMI2
Description
Copy all bits from unsigned 64-bit integer a to dst, and reset (set to 0) the high bits in dst starting at index.
Operation
n := index[7:0]
dst := a
IF (n < 64)
dst[63:n] := 0
FI
Performance
unsigned __int32 _castf32_u32 (float a)
Synopsis
unsigned __int32 _castf32_u32 (float a)
#include "immintrin.h"
Description
Cast from type float to type unsigned __int32 without conversion.
This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
unsigned __int64 _castf64_u64 (double a)
Synopsis
unsigned __int64 _castf64_u64 (double a)
#include "immintrin.h"
Description
Cast from type double to type unsigned __int64 without conversion.
This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m128 _mm_castpd_ps (__m128d a)
Synopsis
__m128 _mm_castpd_ps (__m128d a)
#include "emmintrin.h"
CPUID Flags: SSE2
Description
Cast vector of type __m128d to type __m128. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m256 _mm256_castpd_ps (__m256d a)
Synopsis
__m256 _mm256_castpd_ps (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Cast vector of type __m256d to type __m256.
This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m512 _mm512_castpd_ps (__m512d a)
Synopsis
__m512 _mm512_castpd_ps (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Cast vector of type __m512d to type __m512.
This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m128i _mm_castpd_si128 (__m128d a)
Synopsis
__m128i _mm_castpd_si128 (__m128d a)
#include "emmintrin.h"
CPUID Flags: SSE2
Description
Cast vector of type __m128d to type __m128i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m256i _mm256_castpd_si256 (__m256d a)
Synopsis
__m256i _mm256_castpd_si256 (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Cast vector of type __m256d to type __m256i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m512i _mm512_castpd_si512 (__m512d a)
Synopsis
__m512i _mm512_castpd_si512 (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Cast vector of type __m512d to type __m512i.
This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m256d _mm256_castpd128_pd256 (__m128d a)
Synopsis
__m256d _mm256_castpd128_pd256 (__m128d a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Cast vector of type __m128d to type __m256d; the upper 128 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m512d _mm512_castpd128_pd512 (__m128d a)
Synopsis
__m512d _mm512_castpd128_pd512 (__m128d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Cast vector of type __m128d to type __m512d; the upper 384 bits of the result are undefined.
This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m128d _mm256_castpd256_pd128 (__m256d a)
Synopsis
__m128d _mm256_castpd256_pd128 (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Cast vector of type __m256d to type __m128d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m512d _mm512_castpd256_pd512 (__m256d a)
Synopsis
__m512d _mm512_castpd256_pd512 (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Cast vector of type __m256d to type __m512d; the upper 256 bits of the result are undefined.
This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m128d _mm512_castpd512_pd128 (__m512d a)
Synopsis
__m128d _mm512_castpd512_pd128 (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Cast vector of type __m512d to type __m128d.
This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m256d _mm512_castpd512_pd256 (__m512d a)
Synopsis
__m256d _mm512_castpd512_pd256 (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Cast vector of type __m512d to type __m256d.
This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m128d _mm_castps_pd (__m128 a)
Synopsis
__m128d _mm_castps_pd (__m128 a)
#include "emmintrin.h"
CPUID Flags: SSE2
Description
Cast vector of type __m128 to type __m128d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m256d _mm256_castps_pd (__m256 a)
Synopsis
__m256d _mm256_castps_pd (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Cast vector of type __m256 to type __m256d.
This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m512d _mm512_castps_pd (__m512 a)
Synopsis
__m512d _mm512_castps_pd (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Cast vector of type __m512 to type __m512d.
This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m128i _mm_castps_si128 (__m128 a)
Synopsis
__m128i _mm_castps_si128 (__m128 a)
#include "emmintrin.h"
CPUID Flags: SSE2
Description
Cast vector of type __m128 to type __m128i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m256i _mm256_castps_si256 (__m256 a)
Synopsis
__m256i _mm256_castps_si256 (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Casts vector of type __m256 to type __m256i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m512i _mm512_castps_si512 (__m512 a)
Synopsis
__m512i _mm512_castps_si512 (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Cast vector of type __m512 to type __m512i.
This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m256 _mm256_castps128_ps256 (__m128 a)
Synopsis
__m256 _mm256_castps128_ps256 (__m128 a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Casts vector of type __m128 to type __m256; the upper 128 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m512 _mm512_castps128_ps512 (__m128 a)
Synopsis
__m512 _mm512_castps128_ps512 (__m128 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Cast vector of type __m128 to type __m512; the upper 384 bits of the result are undefined.
This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m128 _mm256_castps256_ps128 (__m256 a)
Synopsis
__m128 _mm256_castps256_ps128 (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Casts vector of type __m256 to type __m128. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m512 _mm512_castps256_ps512 (__m256 a)
Synopsis
__m512 _mm512_castps256_ps512 (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Cast vector of type __m256 to type __m512; the upper 256 bits of the result are undefined.
This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m128 _mm512_castps512_ps128 (__m512 a)
Synopsis
__m128 _mm512_castps512_ps128 (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Cast vector of type __m512 to type __m128.
This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m256 _mm512_castps512_ps256 (__m512 a)
Synopsis
__m256 _mm512_castps512_ps256 (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Cast vector of type __m512 to type __m256.
This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m128d _mm_castsi128_pd (__m128i a)
Synopsis
__m128d _mm_castsi128_pd (__m128i a)
#include "emmintrin.h"
CPUID Flags: SSE2
Description
Cast vector of type __m128i to type __m128d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m128 _mm_castsi128_ps (__m128i a)
Synopsis
__m128 _mm_castsi128_ps (__m128i a)
#include "emmintrin.h"
CPUID Flags: SSE2
Description
Cast vector of type __m128i to type __m128. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m256i _mm256_castsi128_si256 (__m128i a)
Synopsis
__m256i _mm256_castsi128_si256 (__m128i a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Casts vector of type __m128i to type __m256i; the upper 128 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m512i _mm512_castsi128_si512 (__m128i a)
Synopsis
__m512i _mm512_castsi128_si512 (__m128i a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Cast vector of type __m128i to type __m512i; the upper 384 bits of the result are undefined.
This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m256d _mm256_castsi256_pd (__m256i a)
Synopsis
__m256d _mm256_castsi256_pd (__m256i a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Casts vector of type __m256i to type __m256d. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m256 _mm256_castsi256_ps (__m256i a)
Synopsis
__m256 _mm256_castsi256_ps (__m256i a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Casts vector of type __m256i to type __m256. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m128i _mm256_castsi256_si128 (__m256i a)
Synopsis
__m128i _mm256_castsi256_si128 (__m256i a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Casts vector of type __m256i to type __m128i. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m512i _mm512_castsi256_si512 (__m256i a)
Synopsis
__m512i _mm512_castsi256_si512 (__m256i a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Cast vector of type __m256i to type __m512i; the upper 256 bits of the result are undefined.
This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m512d _mm512_castsi512_pd (__m512i a)
Synopsis
__m512d _mm512_castsi512_pd (__m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Cast vector of type __m512i to type __m512d.
This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m512 _mm512_castsi512_ps (__m512i a)
Synopsis
__m512 _mm512_castsi512_ps (__m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Cast vector of type __m512i to type __m512.
This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m128i _mm512_castsi512_si128 (__m512i a)
Synopsis
__m128i _mm512_castsi512_si128 (__m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Cast vector of type __m512i to type __m128i.
This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
__m256i _mm512_castsi512_si256 (__m512i a)
Synopsis
__m256i _mm512_castsi512_si256 (__m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Cast vector of type __m512i to type __m256i.
This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
float _castu32_f32 (unsigned __int32 a)
Synopsis
float _castu32_f32 (unsigned __int32 a)
#include "immintrin.h"
Description
Cast from type unsigned __int32 to type float without conversion.
This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
double _castu64_f64 (unsigned __int64 a)
Synopsis
double _castu64_f64 (unsigned __int64 a)
#include "immintrin.h"
Description
Cast from type unsigned __int64 to type double without conversion.
This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
...
__m128d _mm_cbrt_pd (__m128d a)
Synopsis
__m128d _mm_cbrt_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the cube root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := CubeRoot(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
...
__m256d _mm256_cbrt_pd (__m256d a)
Synopsis
__m256d _mm256_cbrt_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the cube root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := CubeRoot(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
...
__m512d _mm512_cbrt_pd (__m512d a)
Synopsis
__m512d _mm512_cbrt_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the cube root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := CubeRoot(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
...
__m512d _mm512_mask_cbrt_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_cbrt_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the cube root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := CubeRoot(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128 _mm_cbrt_ps (__m128 a)
Synopsis
__m128 _mm_cbrt_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the cube root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := CubeRoot(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256 _mm256_cbrt_ps (__m256 a)
Synopsis
__m256 _mm256_cbrt_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the cube root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := CubeRoot(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
...
__m512 _mm512_cbrt_ps (__m512 a)
Synopsis
__m512 _mm512_cbrt_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the cube root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := CubeRoot(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
...
__m512 _mm512_mask_cbrt_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_cbrt_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the cube root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := CubeRoot(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128d _mm_cdfnorm_pd (__m128d a)
Synopsis
__m128d _mm_cdfnorm_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the cumulative distribution function of packed double-precision (64-bit) floating-point elements in a using the normal distribution, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := CDFNormal(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
...
__m256d _mm256_cdfnorm_pd (__m256d a)
Synopsis
__m256d _mm256_cdfnorm_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the cumulative distribution function of packed double-precision (64-bit) floating-point elements in a using the normal distribution, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := CDFNormal(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
...
__m512d _mm512_cdfnorm_pd (__m512d a)
Synopsis
__m512d _mm512_cdfnorm_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the cumulative distribution function of packed double-precision (64-bit) floating-point elements in a using the normal distribution, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := CDFNormal(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
...
__m512d _mm512_mask_cdfnorm_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_cdfnorm_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the cumulative distribution function of packed double-precision (64-bit) floating-point elements in a using the normal distribution, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := CDFNormal(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128 _mm_cdfnorm_ps (__m128 a)
Synopsis
__m128 _mm_cdfnorm_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the cumulative distribution function of packed single-precision (32-bit) floating-point elements in a using the normal distribution, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := CDFNormal(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256 _mm256_cdfnorm_ps (__m256 a)
Synopsis
__m256 _mm256_cdfnorm_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the cumulative distribution function of packed single-precision (32-bit) floating-point elements in a using the normal distribution, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := CDFNormal(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
...
__m512 _mm512_cdfnorm_ps (__m512 a)
Synopsis
__m512 _mm512_cdfnorm_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the cumulative distribution function of packed single-precision (32-bit) floating-point elements in a using the normal distribution, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := CDFNormal(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
...
__m512 _mm512_mask_cdfnorm_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_cdfnorm_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the cumulative distribution function of packed single-precision (32-bit) floating-point elements in a using the normal distribution, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := CDFNormal(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128d _mm_cdfnorminv_pd (__m128d a)
Synopsis
__m128d _mm_cdfnorminv_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the inverse cumulative distribution function of packed double-precision (64-bit) floating-point elements in a using the normal distribution, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := InverseCDFNormal(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
...
__m256d _mm256_cdfnorminv_pd (__m256d a)
Synopsis
__m256d _mm256_cdfnorminv_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the inverse cumulative distribution function of packed double-precision (64-bit) floating-point elements in a using the normal distribution, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := InverseCDFNormal(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
...
__m512d _mm512_cdfnorminv_pd (__m512d a)
Synopsis
__m512d _mm512_cdfnorminv_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the inverse cumulative distribution function of packed double-precision (64-bit) floating-point elements in a using the normal distribution, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := InverseCDFNormal(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
...
__m512d _mm512_mask_cdfnorminv_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_cdfnorminv_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the inverse cumulative distribution function of packed double-precision (64-bit) floating-point elements in a using the normal distribution, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := InverseCDFNormal(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128 _mm_cdfnorminv_ps (__m128 a)
Synopsis
__m128 _mm_cdfnorminv_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the inverse cumulative distribution function of packed single-precision (32-bit) floating-point elements in a using the normal distribution, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := InverseCDFNormal(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256 _mm256_cdfnorminv_ps (__m256 a)
Synopsis
__m256 _mm256_cdfnorminv_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the inverse cumulative distribution function of packed single-precision (32-bit) floating-point elements in a using the normal distribution, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := InverseCDFNormal(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
...
__m512 _mm512_cdfnorminv_ps (__m512 a)
Synopsis
__m512 _mm512_cdfnorminv_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the inverse cumulative distribution function of packed single-precision (32-bit) floating-point elements in a using the normal distribution, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := InverseCDFNormal(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
...
__m512 _mm512_mask_cdfnorminv_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_cdfnorminv_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the inverse cumulative distribution function of packed single-precision (32-bit) floating-point elements in a using the normal distribution, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := InverseCDFNormal(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
roundpd
__m128d _mm_ceil_pd (__m128d a)
Synopsis
__m128d _mm_ceil_pd (__m128d a)
#include "smmintrin.h"
Instruction: roundpd xmm, xmm, imm
CPUID Flags: SSE4.1
Description
Round the packed double-precision (64-bit) floating-point elements in a up to an integer value, and store the results as packed double-precision floating-point elements in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := CEIL(a[i+63:i])
ENDFOR
Performance
vroundpd
__m256d _mm256_ceil_pd (__m256d a)
Synopsis
__m256d _mm256_ceil_pd (__m256d a)
#include "immintrin.h"
Instruction: vroundpd ymm, ymm, imm
CPUID Flags: AVX
Description
Round the packed double-precision (64-bit) floating-point elements in a up to an integer value, and store the results as packed double-precision floating-point elements in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := CEIL(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
Performance
...
__m512d _mm512_ceil_pd (__m512d a)
Synopsis
__m512d _mm512_ceil_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Round the packed double-precision (64-bit) floating-point elements in a up to an integer value, and store the results as packed double-precision floating-point elements in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := CEIL(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
...
__m512d _mm512_mask_ceil_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_ceil_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Round the packed double-precision (64-bit) floating-point elements in a up to an integer value, and store the results as packed double-precision floating-point elements in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := CEIL(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
roundps
__m128 _mm_ceil_ps (__m128 a)
Synopsis
__m128 _mm_ceil_ps (__m128 a)
#include "smmintrin.h"
Instruction: roundps xmm, xmm, imm
CPUID Flags: SSE4.1
Description
Round the packed single-precision (32-bit) floating-point elements in a up to an integer value, and store the results as packed single-precision floating-point elements in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := CEIL(a[i+31:i])
ENDFOR
Performance
vroundps
__m256 _mm256_ceil_ps (__m256 a)
Synopsis
__m256 _mm256_ceil_ps (__m256 a)
#include "immintrin.h"
Instruction: vroundps ymm, ymm, imm
CPUID Flags: AVX
Description
Round the packed single-precision (32-bit) floating-point elements in a up to an integer value, and store the results as packed single-precision floating-point elements in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := CEIL(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
Performance
...
__m512 _mm512_ceil_ps (__m512 a)
Synopsis
__m512 _mm512_ceil_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Round the packed single-precision (32-bit) floating-point elements in a up to an integer value, and store the results as packed single-precision floating-point elements in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := CEIL(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
...
__m512 _mm512_mask_ceil_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_ceil_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Round the packed single-precision (32-bit) floating-point elements in a up to an integer value, and store the results as packed single-precision floating-point elements in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := CEIL(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
roundsd
__m128d _mm_ceil_sd (__m128d a, __m128d b)
Synopsis
__m128d _mm_ceil_sd (__m128d a, __m128d b)
#include "smmintrin.h"
Instruction: roundsd xmm, xmm, imm
CPUID Flags: SSE4.1
Description
Round the lower double-precision (64-bit) floating-point element in b up to an integer value, store the result as a double-precision floating-point element in the lower element of dst, and copy the upper element from a to the upper element of dst.
Operation
dst[63:0] := CEIL(b[63:0])
dst[127:64] := a[127:64]
Performance
roundss
__m128 _mm_ceil_ss (__m128 a, __m128 b)
Synopsis
__m128 _mm_ceil_ss (__m128 a, __m128 b)
#include "smmintrin.h"
Instruction: roundss xmm, xmm, imm
CPUID Flags: SSE4.1
Description
Round the lower single-precision (32-bit) floating-point element in b up to an integer value, store the result as a single-precision floating-point element in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
dst[31:0] := CEIL(b[31:0])
dst[127:32] := a[127:32]
Performance
...
__m128 _mm_cexp_ps (__m128 a)
Synopsis
__m128 _mm_cexp_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the exponential value of e raised to the power of packed complex single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := e^(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256 _mm256_cexp_ps (__m256 a)
Synopsis
__m256 _mm256_cexp_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the exponential value of e raised to the power of packed complex single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := e^(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
clevict0, clevict1
void _mm_clevict (const void * ptr, int level)
Synopsis
void _mm_clevict (const void * ptr, int level)
#include "immintrin.h"
Instruction: clevict0 m
clevict1 m
CPUID Flags: KNCNI
Description
Evicts the cache line containing the address ptr from cache level level (can be either 0 or 1).
Operation
CacheLineEvict(ptr, level)
clflush
void _mm_clflush (void const* p)
Synopsis
void _mm_clflush (void const* p)
#include "emmintrin.h"
Instruction: clflush m
CPUID Flags: SSE2
Description
Invalidate and flush the cache line that contains p from all levels of the cache hierarchy.
Performance
clflushopt
void _mm_clflushopt (void const * p)
Synopsis
void _mm_clflushopt (void const * p)
#include "immintrin.h"
Instruction: clflushopt
CPUID Flags: CLFLUSHOPT
Description
Invalidate and flush the cache line that contains p from all levels of the cache hierarchy.
pclmulqdq
__m128i _mm_clmulepi64_si128 (__m128i a, __m128i b, const int imm8)
Synopsis
__m128i _mm_clmulepi64_si128 (__m128i a, __m128i b, const int imm8)
#include "wmmintrin.h"
Instruction: pclmulqdq xmm, xmm, imm
CPUID Flags: PCLMULQDQ
Description
Perform a carry-less multiplication of two 64-bit integers, selected from a and b according to imm8, and store the results in dst.
Operation
IF (imm8[0] = 0)
TEMP1 := a[63:0];
ELSE
TEMP1 := a[127:64];
FI
IF (imm8[4] = 0)
TEMP2 := b[63:0];
ELSE
TEMP2 := b[127:64];
FI
FOR i := 0 to 63
TEMP[i] := (TEMP1[0] AND TEMP2[i]);
FOR j := 1 to i
TEMP [i] := TEMP [i] XOR (TEMP1[j] AND TEMP2[i-j])
ENDFOR
dst[i] := TEMP[i];
ENDFOR
FOR i := 64 to 127
TEMP [i] := 0;
FOR j := (i - 63) to 63
TEMP [i] := TEMP [i] XOR (TEMP1[j] AND TEMP2[i-j])
ENDFOR
dst[i] := TEMP[i];
ENDFOR
dst[127] := 0
Performance
...
__m128 _mm_clog_ps (__m128 a)
Synopsis
__m128 _mm_clog_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the natural logarithm of packed complex single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := ln(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256 _mm256_clog_ps (__m256 a)
Synopsis
__m256 _mm256_clog_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the natural logarithm of packed complex single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := ln(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
vpcmpw
__mmask8 _mm_cmp_epi16_mask (__m128i a, __m128i b, const int imm8)
Synopsis
__mmask8 _mm_cmp_epi16_mask (__m128i a, __m128i b, const int imm8)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
Operation
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 7
i := j*16
k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vpcmpw
__mmask8 _mm_mask_cmp_epi16_mask (__mmask8 k1, __m128i a, __m128i b, const int imm8)
Synopsis
__mmask8 _mm_mask_cmp_epi16_mask (__mmask8 k1, __m128i a, __m128i b, const int imm8)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 7
i := j*16
IF k1[j]
k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vpcmpw
__mmask16 _mm256_cmp_epi16_mask (__m256i a, __m256i b, const int imm8)
Synopsis
__mmask16 _mm256_cmp_epi16_mask (__m256i a, __m256i b, const int imm8)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
Operation
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 15
i := j*16
k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vpcmpw
__mmask16 _mm256_mask_cmp_epi16_mask (__mmask16 k1, __m256i a, __m256i b, const int imm8)
Synopsis
__mmask16 _mm256_mask_cmp_epi16_mask (__mmask16 k1, __m256i a, __m256i b, const int imm8)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 15
i := j*16
IF k1[j]
k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vpcmpw
__mmask32 _mm512_cmp_epi16_mask (__m512i a, __m512i b, const int imm8)
Synopsis
__mmask32 _mm512_cmp_epi16_mask (__m512i a, __m512i b, const int imm8)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512BW
Description
Compare packed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
Operation
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 31
i := j*16
k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0
vpcmpw
__mmask32 _mm512_mask_cmp_epi16_mask (__mmask32 k1, __m512i a, __m512i b, const int imm8)
Synopsis
__mmask32 _mm512_mask_cmp_epi16_mask (__mmask32 k1, __m512i a, __m512i b, const int imm8)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512BW
Description
Compare packed 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 31
i := j*16
IF k1[j]
k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:32] := 0
vpcmpd
__mmask8 _mm_cmp_epi32_mask (__m128i a, __m128i b, const _MM_CMPINT_ENUM imm8)
Synopsis
__mmask8 _mm_cmp_epi32_mask (__m128i a, __m128i b, const _MM_CMPINT_ENUM imm8)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
Operation
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 3
i := j*32
k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0
vpcmpd
__mmask8 _mm_mask_cmp_epi32_mask (__mmask8 k1, __m128i a, __m128i b, const _MM_CMPINT_ENUM imm8)
Synopsis
__mmask8 _mm_mask_cmp_epi32_mask (__mmask8 k1, __m128i a, __m128i b, const _MM_CMPINT_ENUM imm8)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 3
i := j*32
IF k1[j]
k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:4] := 0
vpcmpd
__mmask8 _mm256_cmp_epi32_mask (__m256i a, __m256i b, const _MM_CMPINT_ENUM imm8)
Synopsis
__mmask8 _mm256_cmp_epi32_mask (__m256i a, __m256i b, const _MM_CMPINT_ENUM imm8)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
Operation
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 7
i := j*32
k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vpcmpd
__mmask8 _mm256_mask_cmp_epi32_mask (__mmask8 k1, __m256i a, __m256i b, const _MM_CMPINT_ENUM imm8)
Synopsis
__mmask8 _mm256_mask_cmp_epi32_mask (__mmask8 k1, __m256i a, __m256i b, const _MM_CMPINT_ENUM imm8)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 7
i := j*32
IF k1[j]
k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vpcmpd
__mmask16 _mm512_cmp_epi32_mask (__m512i a, __m512i b, const _MM_CMPINT_ENUM imm8)
Synopsis
__mmask16 _mm512_cmp_epi32_mask (__m512i a, __m512i b, const _MM_CMPINT_ENUM imm8)
#include "immintrin.h"
Instruction: vpcmpd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
Operation
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 15
i := j*32
k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vpcmpd
__mmask16 _mm512_mask_cmp_epi32_mask (__mmask16 k1, __m512i a, __m512i b, const _MM_CMPINT_ENUM imm8)
Synopsis
__mmask16 _mm512_mask_cmp_epi32_mask (__mmask16 k1, __m512i a, __m512i b, const _MM_CMPINT_ENUM imm8)
#include "immintrin.h"
Instruction: vpcmpd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 15
i := j*32
IF k1[j]
k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vpcmpq
__mmask8 _mm_cmp_epi64_mask (__m128i a, __m128i b, const _MM_CMPINT_ENUM imm8)
Synopsis
__mmask8 _mm_cmp_epi64_mask (__m128i a, __m128i b, const _MM_CMPINT_ENUM imm8)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
Operation
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 1
i := j*64
k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0
vpcmpq
__mmask8 _mm_mask_cmp_epi64_mask (__mmask8 k1, __m128i a, __m128i b, const _MM_CMPINT_ENUM imm8)
Synopsis
__mmask8 _mm_mask_cmp_epi64_mask (__mmask8 k1, __m128i a, __m128i b, const _MM_CMPINT_ENUM imm8)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 1
i := j*64
IF k1[j]
k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:2] := 0
vpcmpq
__mmask8 _mm256_cmp_epi64_mask (__m256i a, __m256i b, const _MM_CMPINT_ENUM imm8)
Synopsis
__mmask8 _mm256_cmp_epi64_mask (__m256i a, __m256i b, const _MM_CMPINT_ENUM imm8)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
Operation
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 3
i := j*64
k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0
vpcmpq
__mmask8 _mm256_mask_cmp_epi64_mask (__mmask8 k1, __m256i a, __m256i b, const _MM_CMPINT_ENUM imm8)
Synopsis
__mmask8 _mm256_mask_cmp_epi64_mask (__mmask8 k1, __m256i a, __m256i b, const _MM_CMPINT_ENUM imm8)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 3
i := j*64
IF k1[j]
k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:4] := 0
vpcmpq
__mmask8 _mm512_cmp_epi64_mask (__m512i a, __m512i b, const _MM_CMPINT_ENUM imm8)
Synopsis
__mmask8 _mm512_cmp_epi64_mask (__m512i a, __m512i b, const _MM_CMPINT_ENUM imm8)
#include "immintrin.h"
Instruction: vpcmpq k {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Compare packed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
Operation
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 7
i := j*64
k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vpcmpq
__mmask8 _mm512_mask_cmp_epi64_mask (__mmask8 k1, __m512i a, __m512i b, const _MM_CMPINT_ENUM imm8)
Synopsis
__mmask8 _mm512_mask_cmp_epi64_mask (__mmask8 k1, __m512i a, __m512i b, const _MM_CMPINT_ENUM imm8)
#include "immintrin.h"
Instruction: vpcmpq k {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Compare packed 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 7
i := j*64
IF k1[j]
k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vpcmpb
__mmask16 _mm_cmp_epi8_mask (__m128i a, __m128i b, const int imm8)
Synopsis
__mmask16 _mm_cmp_epi8_mask (__m128i a, __m128i b, const int imm8)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
Operation
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 15
i := j*8
k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vpcmpb
__mmask16 _mm_mask_cmp_epi8_mask (__mmask16 k1, __m128i a, __m128i b, const int imm8)
Synopsis
__mmask16 _mm_mask_cmp_epi8_mask (__mmask16 k1, __m128i a, __m128i b, const int imm8)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 15
i := j*8
IF k1[j]
k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vpcmpb
__mmask32 _mm256_cmp_epi8_mask (__m256i a, __m256i b, const int imm8)
Synopsis
__mmask32 _mm256_cmp_epi8_mask (__m256i a, __m256i b, const int imm8)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
Operation
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 31
i := j*8
k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0
vpcmpb
__mmask32 _mm256_mask_cmp_epi8_mask (__mmask32 k1, __m256i a, __m256i b, const int imm8)
Synopsis
__mmask32 _mm256_mask_cmp_epi8_mask (__mmask32 k1, __m256i a, __m256i b, const int imm8)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 31
i := j*8
IF k1[j]
k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:32] := 0
vpcmpb
__mmask64 _mm512_cmp_epi8_mask (__m512i a, __m512i b, const int imm8)
Synopsis
__mmask64 _mm512_cmp_epi8_mask (__m512i a, __m512i b, const int imm8)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512BW
Description
Compare packed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
Operation
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 63
i := j*8
k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:64] := 0
vpcmpb
__mmask64 _mm512_mask_cmp_epi8_mask (__mmask64 k1, __m512i a, __m512i b, const int imm8)
Synopsis
__mmask64 _mm512_mask_cmp_epi8_mask (__mmask64 k1, __m512i a, __m512i b, const int imm8)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512BW
Description
Compare packed 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 63
i := j*8
IF k1[j]
k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:64] := 0
vpcmpuw
__mmask8 _mm_cmp_epu16_mask (__m128i a, __m128i b, const int imm8)
Synopsis
__mmask8 _mm_cmp_epu16_mask (__m128i a, __m128i b, const int imm8)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
Operation
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 7
i := j*16
k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vpcmpuw
__mmask8 _mm_mask_cmp_epu16_mask (__mmask8 k1, __m128i a, __m128i b, const int imm8)
Synopsis
__mmask8 _mm_mask_cmp_epu16_mask (__mmask8 k1, __m128i a, __m128i b, const int imm8)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 7
i := j*16
IF k1[j]
k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vpcmpuw
__mmask16 _mm256_cmp_epu16_mask (__m256i a, __m256i b, const int imm8)
Synopsis
__mmask16 _mm256_cmp_epu16_mask (__m256i a, __m256i b, const int imm8)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
Operation
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 15
i := j*16
k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vpcmpuw
__mmask16 _mm256_mask_cmp_epu16_mask (__mmask16 k1, __m256i a, __m256i b, const int imm8)
Synopsis
__mmask16 _mm256_mask_cmp_epu16_mask (__mmask16 k1, __m256i a, __m256i b, const int imm8)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 15
i := j*16
IF k1[j]
k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vpcmpuw
__mmask32 _mm512_cmp_epu16_mask (__m512i a, __m512i b, const int imm8)
Synopsis
__mmask32 _mm512_cmp_epu16_mask (__m512i a, __m512i b, const int imm8)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
Operation
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 31
i := j*16
k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0
vpcmpuw
__mmask32 _mm512_mask_cmp_epu16_mask (__mmask32 k1, __m512i a, __m512i b, const int imm8)
Synopsis
__mmask32 _mm512_mask_cmp_epu16_mask (__mmask32 k1, __m512i a, __m512i b, const int imm8)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 31
i := j*16
IF k1[j]
k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:32] := 0
vpcmpud
__mmask8 _mm_cmp_epu32_mask (__m128i a, __m128i b, const _MM_CMPINT_ENUM imm8)
Synopsis
__mmask8 _mm_cmp_epu32_mask (__m128i a, __m128i b, const _MM_CMPINT_ENUM imm8)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
Operation
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 3
i := j*32
k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0
vpcmpud
__mmask8 _mm_mask_cmp_epu32_mask (__mmask8 k1, __m128i a, __m128i b, const _MM_CMPINT_ENUM imm8)
Synopsis
__mmask8 _mm_mask_cmp_epu32_mask (__mmask8 k1, __m128i a, __m128i b, const _MM_CMPINT_ENUM imm8)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 3
i := j*32
IF k1[j]
k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:4] := 0
vpcmpud
__mmask8 _mm256_cmp_epu32_mask (__m256i a, __m256i b, const _MM_CMPINT_ENUM imm8)
Synopsis
__mmask8 _mm256_cmp_epu32_mask (__m256i a, __m256i b, const _MM_CMPINT_ENUM imm8)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
Operation
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 7
i := j*32
k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vpcmpud
__mmask8 _mm256_mask_cmp_epu32_mask (__mmask8 k1, __m256i a, __m256i b, const _MM_CMPINT_ENUM imm8)
Synopsis
__mmask8 _mm256_mask_cmp_epu32_mask (__mmask8 k1, __m256i a, __m256i b, const _MM_CMPINT_ENUM imm8)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 7
i := j*32
IF k1[j]
k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vpcmpud
__mmask16 _mm512_cmp_epu32_mask (__m512i a, __m512i b, const _MM_CMPINT_ENUM imm8)
Synopsis
__mmask16 _mm512_cmp_epu32_mask (__m512i a, __m512i b, const _MM_CMPINT_ENUM imm8)
#include "immintrin.h"
Instruction: vpcmpud k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
Operation
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 15
i := j*32
k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vpcmpud
__mmask16 _mm512_mask_cmp_epu32_mask (__mmask16 k1, __m512i a, __m512i b, const _MM_CMPINT_ENUM imm8)
Synopsis
__mmask16 _mm512_mask_cmp_epu32_mask (__mmask16 k1, __m512i a, __m512i b, const _MM_CMPINT_ENUM imm8)
#include "immintrin.h"
Instruction: vpcmpud k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed unsigned 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 15
i := j*32
IF k1[j]
k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vpcmpuq
__mmask8 _mm_cmp_epu64_mask (__m128i a, __m128i b, const _MM_CMPINT_ENUM imm8)
Synopsis
__mmask8 _mm_cmp_epu64_mask (__m128i a, __m128i b, const _MM_CMPINT_ENUM imm8)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
Operation
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 1
i := j*64
k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0
vpcmpuq
__mmask8 _mm_mask_cmp_epu64_mask (__mmask8 k1, __m128i a, __m128i b, const _MM_CMPINT_ENUM imm8)
Synopsis
__mmask8 _mm_mask_cmp_epu64_mask (__mmask8 k1, __m128i a, __m128i b, const _MM_CMPINT_ENUM imm8)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 1
i := j*64
IF k1[j]
k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:2] := 0
vpcmpuq
__mmask8 _mm256_cmp_epu64_mask (__m256i a, __m256i b, const _MM_CMPINT_ENUM imm8)
Synopsis
__mmask8 _mm256_cmp_epu64_mask (__m256i a, __m256i b, const _MM_CMPINT_ENUM imm8)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
Operation
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 3
i := j*64
k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0
vpcmpuq
__mmask8 _mm256_mask_cmp_epu64_mask (__mmask8 k1, __m256i a, __m256i b, const _MM_CMPINT_ENUM imm8)
Synopsis
__mmask8 _mm256_mask_cmp_epu64_mask (__mmask8 k1, __m256i a, __m256i b, const _MM_CMPINT_ENUM imm8)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 3
i := j*64
IF k1[j]
k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:4] := 0
vpcmpuq
__mmask8 _mm512_cmp_epu64_mask (__m512i a, __m512i b, const _MM_CMPINT_ENUM imm8)
Synopsis
__mmask8 _mm512_cmp_epu64_mask (__m512i a, __m512i b, const _MM_CMPINT_ENUM imm8)
#include "immintrin.h"
Instruction: vpcmpuq k {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
Operation
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 7
i := j*64
k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vpcmpuq
__mmask8 _mm512_mask_cmp_epu64_mask (__mmask8 k1, __m512i a, __m512i b, const _MM_CMPINT_ENUM imm8)
Synopsis
__mmask8 _mm512_mask_cmp_epu64_mask (__mmask8 k1, __m512i a, __m512i b, const _MM_CMPINT_ENUM imm8)
#include "immintrin.h"
Instruction: vpcmpuq k {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Compare packed unsigned 64-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 7
i := j*64
IF k1[j]
k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vpcmpub
__mmask16 _mm_cmp_epu8_mask (__m128i a, __m128i b, const int imm8)
Synopsis
__mmask16 _mm_cmp_epu8_mask (__m128i a, __m128i b, const int imm8)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
Operation
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 15
i := j*8
k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vpcmpub
__mmask16 _mm_mask_cmp_epu8_mask (__mmask16 k1, __m128i a, __m128i b, const int imm8)
Synopsis
__mmask16 _mm_mask_cmp_epu8_mask (__mmask16 k1, __m128i a, __m128i b, const int imm8)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 15
i := j*8
IF k1[j]
k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vpcmpub
__mmask32 _mm256_cmp_epu8_mask (__m256i a, __m256i b, const int imm8)
Synopsis
__mmask32 _mm256_cmp_epu8_mask (__m256i a, __m256i b, const int imm8)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
Operation
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 31
i := j*8
k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0
vpcmpub
__mmask32 _mm256_mask_cmp_epu8_mask (__mmask32 k1, __m256i a, __m256i b, const int imm8)
Synopsis
__mmask32 _mm256_mask_cmp_epu8_mask (__mmask32 k1, __m256i a, __m256i b, const int imm8)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 31
i := j*8
IF k1[j]
k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:32] := 0
vpcmpub
__mmask64 _mm512_cmp_epu8_mask (__m512i a, __m512i b, const int imm8)
Synopsis
__mmask64 _mm512_cmp_epu8_mask (__m512i a, __m512i b, const int imm8)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
Operation
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 63
i := j*8
k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:64] := 0
vpcmpub
__mmask64 _mm512_mask_cmp_epu8_mask (__mmask64 k1, __m512i a, __m512i b, const int imm8)
Synopsis
__mmask64 _mm512_mask_cmp_epu8_mask (__mmask64 k1, __m512i a, __m512i b, const int imm8)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 63
i := j*8
IF k1[j]
k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:64] := 0
vcmppd
__m128d _mm_cmp_pd (__m128d a, __m128d b, const int imm8)
Synopsis
__m128d _mm_cmp_pd (__m128d a, __m128d b, const int imm8)
#include "immintrin.h"
Instruction: vcmppd xmm, xmm, xmm, imm
CPUID Flags: AVX
Description
Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in dst.
Operation
CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 1
i := j*64
dst[i+63:i] := ( a[i+63:i] OP b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0
ENDFOR
dst[MAX:128] := 0
Performance
vcmppd
__m256d _mm256_cmp_pd (__m256d a, __m256d b, const int imm8)
Synopsis
__m256d _mm256_cmp_pd (__m256d a, __m256d b, const int imm8)
#include "immintrin.h"
Instruction: vcmppd ymm, ymm, ymm, imm
CPUID Flags: AVX
Description
Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in dst.
Operation
CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 3
i := j*64
dst[i+63:i] := ( a[i+63:i] OP b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0
ENDFOR
dst[MAX:256] := 0
Performance
vcmppd
__mmask8 _mm_cmp_pd_mask (__m128d a, __m128d b, const int imm8)
Synopsis
__mmask8 _mm_cmp_pd_mask (__m128d a, __m128d b, const int imm8)
#include "immintrin.h"
Instruction: vcmppd
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
Operation
CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 1
i := j*64
k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0
vcmppd
__mmask8 _mm_mask_cmp_pd_mask (__mmask8 k1, __m128d a, __m128d b, const int imm8)
Synopsis
__mmask8 _mm_mask_cmp_pd_mask (__mmask8 k1, __m128d a, __m128d b, const int imm8)
#include "immintrin.h"
Instruction: vcmppd
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 1
i := j*64
IF k1[j]
k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:2] := 0
vcmppd
__mmask8 _mm256_cmp_pd_mask (__m256d a, __m256d b, const int imm8)
Synopsis
__mmask8 _mm256_cmp_pd_mask (__m256d a, __m256d b, const int imm8)
#include "immintrin.h"
Instruction: vcmppd
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
Operation
CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 3
i := j*64
k[j] := (a[i+63:i] OP b[i+63:i]) ? 1 : 0
ENDFOR
k[MAX:4] := 0
vcmppd
__mmask8 _mm256_mask_cmp_pd_mask (__mmask8 k1, __m256d a, __m256d b, const int imm8)
Synopsis
__mmask8 _mm256_mask_cmp_pd_mask (__mmask8 k1, __m256d a, __m256d b, const int imm8)
#include "immintrin.h"
Instruction: vcmppd
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 3
i := j*64
IF k1[j]
k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:4] := 0
vcmppd
__mmask8 _mm512_cmp_pd_mask (__m512d a, __m512d b, const int imm8)
Synopsis
__mmask8 _mm512_cmp_pd_mask (__m512d a, __m512d b, const int imm8)
#include "immintrin.h"
Instruction: vcmppd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
Operation
CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 7
i := j*64
k[j] := (a[i+63:i] OP b[i+63:i]) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vcmppd
__mmask8 _mm512_mask_cmp_pd_mask (__mmask8 k1, __m512d a, __m512d b, const int imm8)
Synopsis
__mmask8 _mm512_mask_cmp_pd_mask (__mmask8 k1, __m512d a, __m512d b, const int imm8)
#include "immintrin.h"
Instruction: vcmppd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 7
i := j*64
IF k1[j]
k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vcmpps
__m128 _mm_cmp_ps (__m128 a, __m128 b, const int imm8)
Synopsis
__m128 _mm_cmp_ps (__m128 a, __m128 b, const int imm8)
#include "immintrin.h"
Instruction: vcmpps xmm, xmm, xmm, imm
CPUID Flags: AVX
Description
Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in dst.
Operation
CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 3
i := j*32
dst[i+31:i] := ( a[i+31:i] OP b[i+31:i] ) ? 0xFFFFFFFF : 0
ENDFOR
dst[MAX:128] := 0
Performance
vcmpps
__m256 _mm256_cmp_ps (__m256 a, __m256 b, const int imm8)
Synopsis
__m256 _mm256_cmp_ps (__m256 a, __m256 b, const int imm8)
#include "immintrin.h"
Instruction: vcmpps ymm, ymm, ymm, imm
CPUID Flags: AVX
Description
Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in dst.
Operation
CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 7
i := j*32
dst[i+31:i] := ( a[i+31:i] OP b[i+31:i] ) ? 0xFFFFFFFF : 0
ENDFOR
dst[MAX:256] := 0
Performance
vcmpps
__mmask8 _mm_cmp_ps_mask (__m128 a, __m128 b, const int imm8)
Synopsis
__mmask8 _mm_cmp_ps_mask (__m128 a, __m128 b, const int imm8)
#include "immintrin.h"
Instruction: vcmpps
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
Operation
CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 3
i := j*32
k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0
vcmpps
__mmask8 _mm_mask_cmp_ps_mask (__mmask8 k1, __m128 a, __m128 b, const int imm8)
Synopsis
__mmask8 _mm_mask_cmp_ps_mask (__mmask8 k1, __m128 a, __m128 b, const int imm8)
#include "immintrin.h"
Instruction: vcmpps
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 3
i := j*32
IF k1[j]
k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:4] := 0
vcmpps
__mmask8 _mm256_cmp_ps_mask (__m256 a, __m256 b, const int imm8)
Synopsis
__mmask8 _mm256_cmp_ps_mask (__m256 a, __m256 b, const int imm8)
#include "immintrin.h"
Instruction: vcmpps
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
Operation
CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 7
i := j*32
k[j] := (a[i+31:i] OP b[i+31:i]) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vcmpps
__mmask8 _mm256_mask_cmp_ps_mask (__mmask8 k1, __m256 a, __m256 b, const int imm8)
Synopsis
__mmask8 _mm256_mask_cmp_ps_mask (__mmask8 k1, __m256 a, __m256 b, const int imm8)
#include "immintrin.h"
Instruction: vcmpps
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 7
i := j*32
IF k1[j]
k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vcmpps
__mmask16 _mm512_cmp_ps_mask (__m512 a, __m512 b, const int imm8)
Synopsis
__mmask16 _mm512_cmp_ps_mask (__m512 a, __m512 b, const int imm8)
#include "immintrin.h"
Instruction: vcmpps k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
Operation
CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 15
i := j*32
k[j] := (a[i+31:i] OP b[i+31:i]) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vcmpps
__mmask16 _mm512_mask_cmp_ps_mask (__mmask16 k1, __m512 a, __m512 b, const int imm8)
Synopsis
__mmask16 _mm512_mask_cmp_ps_mask (__mmask16 k1, __m512 a, __m512 b, const int imm8)
#include "immintrin.h"
Instruction: vcmpps k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 15
i := j*32
IF k1[j]
k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vcmppd
__mmask8 _mm512_cmp_round_pd_mask (__m512d a, __m512d b, const int imm8, const int sae)
Synopsis
__mmask8 _mm512_cmp_round_pd_mask (__m512d a, __m512d b, const int imm8, const int sae)
#include "immintrin.h"
Instruction: vcmppd k {k}, zmm, zmm {sae}, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 7
i := j*64
k[j] := (a[i+63:i] OP b[i+63:i]) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vcmppd
__mmask8 _mm512_mask_cmp_round_pd_mask (__mmask8 k1, __m512d a, __m512d b, const int imm8, const int sae)
Synopsis
__mmask8 _mm512_mask_cmp_round_pd_mask (__mmask8 k1, __m512d a, __m512d b, const int imm8, const int sae)
#include "immintrin.h"
Instruction: vcmppd k {k}, zmm, zmm {sae}, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed double-precision (64-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 7
i := j*64
IF k1[j]
k[j] := ( a[i+63:i] OP b[i+63:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vcmpps
__mmask16 _mm512_cmp_round_ps_mask (__m512 a, __m512 b, const int imm8, const int sae)
Synopsis
__mmask16 _mm512_cmp_round_ps_mask (__m512 a, __m512 b, const int imm8, const int sae)
#include "immintrin.h"
Instruction: vcmpps k {k}, zmm, zmm {sae}, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 15
i := j*32
k[j] := (a[i+31:i] OP b[i+31:i]) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vcmpps
__mmask16 _mm512_mask_cmp_round_ps_mask (__mmask16 k1, __m512 a, __m512 b, const int imm8, const int sae)
Synopsis
__mmask16 _mm512_mask_cmp_round_ps_mask (__mmask16 k1, __m512 a, __m512 b, const int imm8, const int sae)
#include "immintrin.h"
Instruction: vcmpps k {k}, zmm, zmm {sae}, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed single-precision (32-bit) floating-point elements in a and b based on the comparison operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
FOR j := 0 to 15
i := j*32
IF k1[j]
k[j] := ( a[i+31:i] OP b[i+31:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vcmpsd
__mmask8 _mm_cmp_round_sd_mask (__m128d a, __m128d b, const int imm8, const int sae)
Synopsis
__mmask8 _mm_cmp_round_sd_mask (__m128d a, __m128d b, const int imm8, const int sae)
#include "immintrin.h"
Instruction: vcmpsd k {k}, xmm, xmm {sae}, imm
CPUID Flags: AVX512F
Description
Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k.
Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
k[0] := ( a[63:0] OP b[63:0] ) ? 1 : 0
k[MAX:1] := 0
vcmpsd
__mmask8 _mm_mask_cmp_round_sd_mask (__mmask8 k1, __m128d a, __m128d b, const int imm8, const int sae)
Synopsis
__mmask8 _mm_mask_cmp_round_sd_mask (__mmask8 k1, __m128d a, __m128d b, const int imm8, const int sae)
#include "immintrin.h"
Instruction: vcmpsd k {k}, xmm, xmm {sae}, imm
CPUID Flags: AVX512F
Description
Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k using zeromask k1 (the element is zeroed out when mask bit 0 is not set).
Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
IF k1[0]
k[0] := ( a[63:0] OP b[63:0] ) ? 1 : 0
ELSE
k[0] := 0
FI
k[MAX:1] := 0
vcmpss
__mmask8 _mm_cmp_round_ss_mask (__m128 a, __m128 b, const int imm8, const int sae)
Synopsis
__mmask8 _mm_cmp_round_ss_mask (__m128 a, __m128 b, const int imm8, const int sae)
#include "immintrin.h"
Instruction: vcmpss k {k}, xmm, xmm {sae}, imm
CPUID Flags: AVX512F
Description
Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k.
Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
k[0] := ( a[31:0] OP b[31:0] ) ? 1 : 0
k[MAX:1] := 0
vcmpss
__mmask8 _mm_mask_cmp_round_ss_mask (__mmask8 k1, __m128 a, __m128 b, const int imm8, const int sae)
Synopsis
__mmask8 _mm_mask_cmp_round_ss_mask (__mmask8 k1, __m128 a, __m128 b, const int imm8, const int sae)
#include "immintrin.h"
Instruction: vcmpss k {k}, xmm, xmm {sae}, imm
CPUID Flags: AVX512F
Description
Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k using zeromask k1 (the element is zeroed out when mask bit 0 is not set).
Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
IF k1[0]
k[0] := ( a[31:0] OP b[31:0] ) ? 1 : 0
ELSE
k[0] := 0
FI
k[MAX:1] := 0
vcmpsd
__m128d _mm_cmp_sd (__m128d a, __m128d b, const int imm8)
Synopsis
__m128d _mm_cmp_sd (__m128d a, __m128d b, const int imm8)
#include "immintrin.h"
Instruction: vcmpsd xmm, xmm, xmm, imm
CPUID Flags: AVX
Description
Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
Operation
CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
dst[63:0] := ( a[63:0] OP b[63:0] ) ? 0xFFFFFFFFFFFFFFFF : 0
dst[127:64] := a[127:64]
dst[MAX:128] := 0
Performance
vcmpsd
__mmask8 _mm_cmp_sd_mask (__m128d a, __m128d b, const int imm8)
Synopsis
__mmask8 _mm_cmp_sd_mask (__m128d a, __m128d b, const int imm8)
#include "immintrin.h"
Instruction: vcmpsd k {k}, xmm, xmm, imm
CPUID Flags: AVX512F
Description
Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k.
Operation
CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
k[0] := ( a[63:0] OP b[63:0] ) ? 1 : 0
k[MAX:1] := 0
vcmpsd
__mmask8 _mm_mask_cmp_sd_mask (__mmask8 k1, __m128d a, __m128d b, const int imm8)
Synopsis
__mmask8 _mm_mask_cmp_sd_mask (__mmask8 k1, __m128d a, __m128d b, const int imm8)
#include "immintrin.h"
Instruction: vcmpsd k {k}, xmm, xmm, imm
CPUID Flags: AVX512F
Description
Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k using zeromask k1 (the element is zeroed out when mask bit 0 is not set).
Operation
CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
IF k1[0]
k[0] := ( a[63:0] OP b[63:0] ) ? 1 : 0
ELSE
k[0] := 0
FI
k[MAX:1] := 0
vcmpss
__m128 _mm_cmp_ss (__m128 a, __m128 b, const int imm8)
Synopsis
__m128 _mm_cmp_ss (__m128 a, __m128 b, const int imm8)
#include "immintrin.h"
Instruction: vcmpss xmm, xmm, xmm, imm
CPUID Flags: AVX
Description
Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
dst[31:0] := ( a[31:0] OP b[31:0] ) ? 0xFFFFFFFF : 0
dst[127:32] := a[127:32]
dst[MAX:128] := 0
Performance
vcmpss
__mmask8 _mm_cmp_ss_mask (__m128 a, __m128 b, const int imm8)
Synopsis
__mmask8 _mm_cmp_ss_mask (__m128 a, __m128 b, const int imm8)
#include "immintrin.h"
Instruction: vcmpss k {k}, xmm, xmm, imm
CPUID Flags: AVX512F
Description
Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k.
Operation
CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
k[0] := ( a[31:0] OP b[31:0] ) ? 1 : 0
k[MAX:1] := 0
vcmpss
__mmask8 _mm_mask_cmp_ss_mask (__mmask8 k1, __m128 a, __m128 b, const int imm8)
Synopsis
__mmask8 _mm_mask_cmp_ss_mask (__mmask8 k1, __m128 a, __m128 b, const int imm8)
#include "immintrin.h"
Instruction: vcmpss k {k}, xmm, xmm, imm
CPUID Flags: AVX512F
Description
Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k using zeromask k1 (the element is zeroed out when mask bit 0 is not set).
Operation
CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
IF k1[0]
k[0] := ( a[31:0] OP b[31:0] ) ? 1 : 0
ELSE
k[0] := 0
FI
k[MAX:1] := 0
pcmpeqw
__m128i _mm_cmpeq_epi16 (__m128i a, __m128i b)
Synopsis
__m128i _mm_cmpeq_epi16 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: pcmpeqw xmm, xmm
CPUID Flags: SSE2
Description
Compare packed 16-bit integers in a and b for equality, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*16
dst[i+15:i] := ( a[i+15:i] == b[i+15:i] ) ? 0xFFFF : 0
ENDFOR
Performance
vpcmpeqw
__m256i _mm256_cmpeq_epi16 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_cmpeq_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpeqw ymm, ymm, ymm
CPUID Flags: AVX2
Description
Compare packed 16-bit integers in a and b for equality, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*16
dst[i+15:i] := ( a[i+15:i] == b[i+15:i] ) ? 0xFFFF : 0
ENDFOR
dst[MAX:256] := 0
Performance
vpcmpw
__mmask8 _mm_cmpeq_epi16_mask (__m128i a, __m128i b)
Synopsis
__mmask8 _mm_cmpeq_epi16_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 16-bit integers in a and b for equality, and store the results in mask vector k.
Operation
FOR j := 0 to 7
i := j*16
k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vpcmpw
__mmask8 _mm_mask_cmpeq_epi16_mask (__mmask8 k1, __m128i a, __m128i b)
Synopsis
__mmask8 _mm_mask_cmpeq_epi16_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 16-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k1[j]
k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vpcmpw
__mmask16 _mm256_cmpeq_epi16_mask (__m256i a, __m256i b)
Synopsis
__mmask16 _mm256_cmpeq_epi16_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 16-bit integers in a and b for equality, and store the results in mask vector k.
Operation
FOR j := 0 to 15
i := j*16
k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vpcmpw
__mmask16 _mm256_mask_cmpeq_epi16_mask (__mmask16 k1, __m256i a, __m256i b)
Synopsis
__mmask16 _mm256_mask_cmpeq_epi16_mask (__mmask16 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 16-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k1[j]
k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vpcmpw
__mmask32 _mm512_cmpeq_epi16_mask (__m512i a, __m512i b)
Synopsis
__mmask32 _mm512_cmpeq_epi16_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512BW
Description
Compare packed 16-bit integers in a and b for equality, and store the results in mask vector k.
Operation
FOR j := 0 to 31
i := j*16
k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0
vpcmpw
__mmask32 _mm512_mask_cmpeq_epi16_mask (__mmask32 k1, __m512i a, __m512i b)
Synopsis
__mmask32 _mm512_mask_cmpeq_epi16_mask (__mmask32 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512BW
Description
Compare packed 16-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k1[j]
k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:32] := 0
pcmpeqd
__m128i _mm_cmpeq_epi32 (__m128i a, __m128i b)
Synopsis
__m128i _mm_cmpeq_epi32 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: pcmpeqd xmm, xmm
CPUID Flags: SSE2
Description
Compare packed 32-bit integers in a and b for equality, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xFFFFFFFF : 0
ENDFOR
Performance
vpcmpeqd
__m256i _mm256_cmpeq_epi32 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_cmpeq_epi32 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpeqd ymm, ymm, ymm
CPUID Flags: AVX2
Description
Compare packed 32-bit integers in a and b for equality, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xFFFFFFFF : 0
ENDFOR
dst[MAX:256] := 0
Performance
vpcmpd
__mmask8 _mm_cmpeq_epi32_mask (__m128i a, __m128i b)
Synopsis
__mmask8 _mm_cmpeq_epi32_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 32-bit integers in a and b for equality, and store the results in mask vector k.
Operation
FOR j := 0 to 3
i := j*32
k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0
vpcmpd
__mmask8 _mm_mask_cmpeq_epi32_mask (__mmask8 k1, __m128i a, __m128i b)
Synopsis
__mmask8 _mm_mask_cmpeq_epi32_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 32-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k1[j]
k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:4] := 0
vpcmpd
__mmask8 _mm256_cmpeq_epi32_mask (__m256i a, __m256i b)
Synopsis
__mmask8 _mm256_cmpeq_epi32_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 32-bit integers in a and b for equality, and store the results in mask vector k.
Operation
FOR j := 0 to 7
i := j*32
k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vpcmpd
__mmask8 _mm256_mask_cmpeq_epi32_mask (__mmask8 k1, __m256i a, __m256i b)
Synopsis
__mmask8 _mm256_mask_cmpeq_epi32_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 32-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k1[j]
k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vpcmpeqd
__mmask16 _mm512_cmpeq_epi32_mask (__m512i a, __m512i b)
Synopsis
__mmask16 _mm512_cmpeq_epi32_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpeqd k {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed 32-bit integers in a and b for equality, and store the results in mask vector k.
Operation
FOR j := 0 to 15
i := j*32
k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vpcmpeqd
__mmask16 _mm512_mask_cmpeq_epi32_mask (__mmask16 k1, __m512i a, __m512i b)
Synopsis
__mmask16 _mm512_mask_cmpeq_epi32_mask (__mmask16 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpeqd k {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed 32-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k1[j]
k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
pcmpeqq
__m128i _mm_cmpeq_epi64 (__m128i a, __m128i b)
Synopsis
__m128i _mm_cmpeq_epi64 (__m128i a, __m128i b)
#include "smmintrin.h"
Instruction: pcmpeqq xmm, xmm
CPUID Flags: SSE4.1
Description
Compare packed 64-bit integers in a and b for equality, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := ( a[i+63:i] == b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0
ENDFOR
Performance
vpcmpeqq
__m256i _mm256_cmpeq_epi64 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_cmpeq_epi64 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpeqq ymm, ymm, ymm
CPUID Flags: AVX2
Description
Compare packed 64-bit integers in a and b for equality, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := ( a[i+63:i] == b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0
ENDFOR
dst[MAX:256] := 0
Performance
vpcmpq
__mmask8 _mm_cmpeq_epi64_mask (__m128i a, __m128i b)
Synopsis
__mmask8 _mm_cmpeq_epi64_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 64-bit integers in a and b for equality, and store the results in mask vector k.
Operation
FOR j := 0 to 1
i := j*64
k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0
vpcmpq
__mmask8 _mm_mask_cmpeq_epi64_mask (__mmask8 k1, __m128i a, __m128i b)
Synopsis
__mmask8 _mm_mask_cmpeq_epi64_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 64-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k1[j]
k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:2] := 0
vpcmpq
__mmask8 _mm256_cmpeq_epi64_mask (__m256i a, __m256i b)
Synopsis
__mmask8 _mm256_cmpeq_epi64_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 64-bit integers in a and b for equality, and store the results in mask vector k.
Operation
FOR j := 0 to 3
i := j*64
k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0
vpcmpq
__mmask8 _mm256_mask_cmpeq_epi64_mask (__mmask8 k1, __m256i a, __m256i b)
Synopsis
__mmask8 _mm256_mask_cmpeq_epi64_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 64-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k1[j]
k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:4] := 0
vpcmpeqq
__mmask8 _mm512_cmpeq_epi64_mask (__m512i a, __m512i b)
Synopsis
__mmask8 _mm512_cmpeq_epi64_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpeqq k {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Compare packed 64-bit integers in a and b for equality, and store the results in mask vector k.
Operation
FOR j := 0 to 7
i := j*64
k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vpcmpeqq
__mmask8 _mm512_mask_cmpeq_epi64_mask (__mmask8 k1, __m512i a, __m512i b)
Synopsis
__mmask8 _mm512_mask_cmpeq_epi64_mask (__mmask8 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpeqq k {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Compare packed 64-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k1[j]
k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
pcmpeqb
__m128i _mm_cmpeq_epi8 (__m128i a, __m128i b)
Synopsis
__m128i _mm_cmpeq_epi8 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: pcmpeqb xmm, xmm
CPUID Flags: SSE2
Description
Compare packed 8-bit integers in a and b for equality, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*8
dst[i+7:i] := ( a[i+7:i] == b[i+7:i] ) ? 0xFF : 0
ENDFOR
Performance
vpcmpeqb
__m256i _mm256_cmpeq_epi8 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_cmpeq_epi8 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpeqb ymm, ymm, ymm
CPUID Flags: AVX2
Description
Compare packed 8-bit integers in a and b for equality, and store the results in dst.
Operation
FOR j := 0 to 31
i := j*8
dst[i+7:i] := ( a[i+7:i] == b[i+7:i] ) ? 0xFF : 0
ENDFOR
dst[MAX:256] := 0
Performance
vpcmpb
__mmask16 _mm_cmpeq_epi8_mask (__m128i a, __m128i b)
Synopsis
__mmask16 _mm_cmpeq_epi8_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 8-bit integers in a and b for equality, and store the results in mask vector k.
Operation
FOR j := 0 to 15
i := j*8
k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vpcmpb
__mmask16 _mm_mask_cmpeq_epi8_mask (__mmask16 k1, __m128i a, __m128i b)
Synopsis
__mmask16 _mm_mask_cmpeq_epi8_mask (__mmask16 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 8-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k1[j]
k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vpcmpb
__mmask32 _mm256_cmpeq_epi8_mask (__m256i a, __m256i b)
Synopsis
__mmask32 _mm256_cmpeq_epi8_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 8-bit integers in a and b for equality, and store the results in mask vector k.
Operation
FOR j := 0 to 31
i := j*8
k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0
vpcmpb
__mmask32 _mm256_mask_cmpeq_epi8_mask (__mmask32 k1, __m256i a, __m256i b)
Synopsis
__mmask32 _mm256_mask_cmpeq_epi8_mask (__mmask32 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 8-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k1[j]
k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:32] := 0
vpcmpb
__mmask64 _mm512_cmpeq_epi8_mask (__m512i a, __m512i b)
Synopsis
__mmask64 _mm512_cmpeq_epi8_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512BW
Description
Compare packed 8-bit integers in a and b for equality, and store the results in mask vector k.
Operation
FOR j := 0 to 63
i := j*8
k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:64] := 0
vpcmpb
__mmask64 _mm512_mask_cmpeq_epi8_mask (__mmask64 k1, __m512i a, __m512i b)
Synopsis
__mmask64 _mm512_mask_cmpeq_epi8_mask (__mmask64 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512BW
Description
Compare packed 8-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k1[j]
k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:64] := 0
vpcmpuw
__mmask8 _mm_cmpeq_epu16_mask (__m128i a, __m128i b)
Synopsis
__mmask8 _mm_cmpeq_epu16_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b for equality, and store the results in mask vector k.
Operation
FOR j := 0 to 7
i := j*16
k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vpcmpuw
__mmask8 _mm_mask_cmpeq_epu16_mask (__mmask8 k1, __m128i a, __m128i b)
Synopsis
__mmask8 _mm_mask_cmpeq_epu16_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k1[j]
k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vpcmpuw
__mmask16 _mm256_cmpeq_epu16_mask (__m256i a, __m256i b)
Synopsis
__mmask16 _mm256_cmpeq_epu16_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b for equality, and store the results in mask vector k.
Operation
FOR j := 0 to 15
i := j*16
k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vpcmpuw
__mmask16 _mm256_mask_cmpeq_epu16_mask (__mmask16 k1, __m256i a, __m256i b)
Synopsis
__mmask16 _mm256_mask_cmpeq_epu16_mask (__mmask16 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k1[j]
k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vpcmpuw
__mmask32 _mm512_cmpeq_epu16_mask (__m512i a, __m512i b)
Synopsis
__mmask32 _mm512_cmpeq_epu16_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b for equality, and store the results in mask vector k.
Operation
FOR j := 0 to 31
i := j*16
k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0
vpcmpuw
__mmask32 _mm512_mask_cmpeq_epu16_mask (__mmask32 k1, __m512i a, __m512i b)
Synopsis
__mmask32 _mm512_mask_cmpeq_epu16_mask (__mmask32 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k1[j]
k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:32] := 0
vpcmpud
__mmask8 _mm_cmpeq_epu32_mask (__m128i a, __m128i b)
Synopsis
__mmask8 _mm_cmpeq_epu32_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 32-bit integers in a and b for equality, and store the results in mask vector k.
Operation
FOR j := 0 to 3
i := j*32
k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0
vpcmpud
__mmask8 _mm_mask_cmpeq_epu32_mask (__mmask8 k1, __m128i a, __m128i b)
Synopsis
__mmask8 _mm_mask_cmpeq_epu32_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 32-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k1[j]
k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:4] := 0
vpcmpud
__mmask8 _mm256_cmpeq_epu32_mask (__m256i a, __m256i b)
Synopsis
__mmask8 _mm256_cmpeq_epu32_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 32-bit integers in a and b for equality, and store the results in mask vector k.
Operation
FOR j := 0 to 7
i := j*32
k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vpcmpud
__mmask8 _mm256_mask_cmpeq_epu32_mask (__mmask8 k1, __m256i a, __m256i b)
Synopsis
__mmask8 _mm256_mask_cmpeq_epu32_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 32-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k1[j]
k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vpcmpud
__mmask16 _mm512_cmpeq_epu32_mask (__m512i a, __m512i b)
Synopsis
__mmask16 _mm512_cmpeq_epu32_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpud k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed unsigned 32-bit integers in a and b for equality, and store the results in mask vector k.
Operation
FOR j := 0 to 15
i := j*32
k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vpcmpud
__mmask16 _mm512_mask_cmpeq_epu32_mask (__mmask16 k1, __m512i a, __m512i b)
Synopsis
__mmask16 _mm512_mask_cmpeq_epu32_mask (__mmask16 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpud k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed unsigned 32-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k1[j]
k[j] := ( a[i+31:i] == b[i+31:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vpcmpuq
__mmask8 _mm_cmpeq_epu64_mask (__m128i a, __m128i b)
Synopsis
__mmask8 _mm_cmpeq_epu64_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 64-bit integers in a and b for equality, and store the results in mask vector k.
Operation
FOR j := 0 to 1
i := j*64
k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0
vpcmpuq
__mmask8 _mm_mask_cmpeq_epu64_mask (__mmask8 k1, __m128i a, __m128i b)
Synopsis
__mmask8 _mm_mask_cmpeq_epu64_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 64-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k1[j]
k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:2] := 0
vpcmpuq
__mmask8 _mm256_cmpeq_epu64_mask (__m256i a, __m256i b)
Synopsis
__mmask8 _mm256_cmpeq_epu64_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 64-bit integers in a and b for equality, and store the results in mask vector k.
Operation
FOR j := 0 to 3
i := j*64
k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0
vpcmpuq
__mmask8 _mm256_mask_cmpeq_epu64_mask (__mmask8 k1, __m256i a, __m256i b)
Synopsis
__mmask8 _mm256_mask_cmpeq_epu64_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 64-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k1[j]
k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:4] := 0
vpcmpuq
__mmask8 _mm512_cmpeq_epu64_mask (__m512i a, __m512i b)
Synopsis
__mmask8 _mm512_cmpeq_epu64_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpuq k {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Compare packed unsigned 64-bit integers in a and b for equality, and store the results in mask vector k.
Operation
FOR j := 0 to 7
i := j*64
k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vpcmpuq
__mmask8 _mm512_mask_cmpeq_epu64_mask (__mmask8 k1, __m512i a, __m512i b)
Synopsis
__mmask8 _mm512_mask_cmpeq_epu64_mask (__mmask8 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpuq k {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Compare packed unsigned 64-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k1[j]
k[j] := ( a[i+63:i] == b[i+63:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vpcmpub
__mmask16 _mm_cmpeq_epu8_mask (__m128i a, __m128i b)
Synopsis
__mmask16 _mm_cmpeq_epu8_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b for equality, and store the results in mask vector k.
Operation
FOR j := 0 to 15
i := j*8
k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vpcmpub
__mmask16 _mm_mask_cmpeq_epu8_mask (__mmask16 k1, __m128i a, __m128i b)
Synopsis
__mmask16 _mm_mask_cmpeq_epu8_mask (__mmask16 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k1[j]
k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vpcmpub
__mmask32 _mm256_cmpeq_epu8_mask (__m256i a, __m256i b)
Synopsis
__mmask32 _mm256_cmpeq_epu8_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b for equality, and store the results in mask vector k.
Operation
FOR j := 0 to 31
i := j*8
k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0
vpcmpub
__mmask32 _mm256_mask_cmpeq_epu8_mask (__mmask32 k1, __m256i a, __m256i b)
Synopsis
__mmask32 _mm256_mask_cmpeq_epu8_mask (__mmask32 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k1[j]
k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:32] := 0
vpcmpub
__mmask64 _mm512_cmpeq_epu8_mask (__m512i a, __m512i b)
Synopsis
__mmask64 _mm512_cmpeq_epu8_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b for equality, and store the results in mask vector k.
Operation
FOR j := 0 to 63
i := j*8
k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:64] := 0
vpcmpub
__mmask64 _mm512_mask_cmpeq_epu8_mask (__mmask64 k1, __m512i a, __m512i b)
Synopsis
__mmask64 _mm512_mask_cmpeq_epu8_mask (__mmask64 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k1[j]
k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:64] := 0
cmppd
__m128d _mm_cmpeq_pd (__m128d a, __m128d b)
Synopsis
__m128d _mm_cmpeq_pd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: cmppd xmm, xmm, imm
CPUID Flags: SSE2
Description
Compare packed double-precision (64-bit) floating-point elements in a and b for equality, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := (a[i+63:i] == b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0
ENDFOR
Performance
vcmppd
__mmask8 _mm512_cmpeq_pd_mask (__m512d a, __m512d b)
Synopsis
__mmask8 _mm512_cmpeq_pd_mask (__m512d a, __m512d b)
#include "immintrin.h"
Instruction: vcmppd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed double-precision (64-bit) floating-point elements in a and b for equality, and store the results in mask vector k.
Operation
FOR j := 0 to 7
i := j*64
k[j] := (a[i+63:i] == b[i+63:i]) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vcmppd
__mmask8 _mm512_mask_cmpeq_pd_mask (__mmask8 k1, __m512d a, __m512d b)
Synopsis
__mmask8 _mm512_mask_cmpeq_pd_mask (__mmask8 k1, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vcmppd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed double-precision (64-bit) floating-point elements in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k1[j]
k[j] := (a[i+63:i] == b[i+63:i]) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
cmpps
__m128 _mm_cmpeq_ps (__m128 a, __m128 b)
Synopsis
__m128 _mm_cmpeq_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: cmpps xmm, xmm, imm
CPUID Flags: SSE
Description
Compare packed single-precision (32-bit) floating-point elements in a and b for equality, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xffffffff : 0
ENDFOR
Performance
vcmpps
__mmask16 _mm512_cmpeq_ps_mask (__m512 a, __m512 b)
Synopsis
__mmask16 _mm512_cmpeq_ps_mask (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vcmpps k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed single-precision (32-bit) floating-point elements in a and b for equality, and store the results in mask vector k.
Operation
FOR j := 0 to 15
i := j*32
k[j] := (a[i+31:i] == b[i+31:i]) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vcmpps
__mmask16 _mm512_mask_cmpeq_ps_mask (__mmask16 k1, __m512 a, __m512 b)
Synopsis
__mmask16 _mm512_mask_cmpeq_ps_mask (__mmask16 k1, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vcmpps k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed single-precision (32-bit) floating-point elements in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k1[j]
k[j] := (a[i+31:i] == b[i+31:i]) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
cmpsd
__m128d _mm_cmpeq_sd (__m128d a, __m128d b)
Synopsis
__m128d _mm_cmpeq_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: cmpsd xmm, xmm, xmm, imm
CPUID Flags: SSE2
Description
Compare the lower double-precision (64-bit) floating-point elements in a and b for equality, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
Operation
dst[63:0] := (a[63:0] == b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0
dst[127:64] := a[127:64]
Performance
cmpss
__m128 _mm_cmpeq_ss (__m128 a, __m128 b)
Synopsis
__m128 _mm_cmpeq_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: cmpss xmm, xmm, imm
CPUID Flags: SSE
Description
Compare the lower single-precision (32-bit) floating-point elements in a and b for equality, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
dst[31:0] := ( a[31:0] == b[31:0] ) ? 0xffffffff : 0
dst[127:32] := a[127:32]
Performance
pcmpestri
int _mm_cmpestra (__m128i a, int la, __m128i b, int lb, const int imm8)
Synopsis
int _mm_cmpestra (__m128i a, int la, __m128i b, int lb, const int imm8)
#include "nmmintrin.h"
Instruction: pcmpestri xmm, xmm, imm
CPUID Flags: SSE4.2
Description
Compare packed strings in
a and
b with lengths
la and
lb using the control in
imm8, and returns 1 if
b did not contain a null character and the resulting mask was zero, and 0 otherwise.
imm can be a combination of:
_SIDD_UBYTE_OPS // unsigned 8-bit characters
_SIDD_UWORD_OPS // unsigned 16-bit characters
_SIDD_SBYTE_OPS // signed 8-bit characters
_SIDD_SWORD_OPS // signed 16-bit characters
_SIDD_CMP_EQUAL_ANY // compare equal any
_SIDD_CMP_RANGES // compare ranges
_SIDD_CMP_EQUAL_EACH // compare equal each
_SIDD_CMP_EQUAL_ORDERED // compare equal ordered
_SIDD_NEGATIVE_POLARITY // negate results
_SIDD_MASKED_NEGATIVE_POLARITY // negate results only before end of string
_SIDD_LEAST_SIGNIFICANT // index only: return least significant bit
_SIDD_MOST_SIGNIFICANT // index only: return most significant bit
_SIDD_BIT_MASK // mask only: return bit mask
_SIDD_UNIT_MASK // mask only: return byte/word mask
Operation
size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
UpperBound := (128 / size) - 1
// compare all characters
aInvalid := 0
bInvalid := 0
FOR i := 0 to UpperBound
m := i*size
FOR j := 0 to UpperBound
n := j*size
BoolRes[i][j] := (a[m+size-1:m] == b[n+size-1:n])
// invalidate characters after EOS
IF i == la
aInvalid := 1
FI
IF j == lb
bInvalid := 1
FI
// override comparisons for invalid characters
CASE (imm8[3:2]) OF
0: // equal any
IF (!aInvalid && bInvalid)
BoolRes[i][j] := 0
ELSE IF (aInvalid && !bInvalid)
BoolRes[i][j] := 0
ELSE If (aInvalid && bInvalid)
BoolRes[i][j] := 0
FI
1: // ranges
IF (!aInvalid && bInvalid)
BoolRes[i][j] := 0
ELSE IF (aInvalid && !bInvalid)
BoolRes[i][j] := 0
ELSE If (aInvalid && bInvalid)
BoolRes[i][j] := 0
FI
2: // equal each
IF (!aInvalid && bInvalid)
BoolRes[i][j] := 0
ELSE IF (aInvalid && !bInvalid)
BoolRes[i][j] := 0
ELSE If (aInvalid && bInvalid)
BoolRes[i][j] := 1
FI
3: // equal ordered
IF (!aInvalid && bInvalid)
BoolRes[i][j] := 0
ELSE IF (aInvalid && !bInvalid)
BoolRes[i][j] := 1
ELSE If (aInvalid && bInvalid)
BoolRes[i][j] := 1
FI
ESAC
ENDFOR
ENDFOR
// aggregate results
CASE (imm8[3:2]) OF
0: // equal any
IntRes1 := 0
FOR i := 0 to UpperBound
FOR j := 0 to UpperBound
IntRes1[i] := IntRes1[i] OR BoolRes[i][j]
ENDFOR
ENDFOR
1: // ranges
IntRes1 := 0
FOR i := 0 to UpperBound
FOR j := 0 to UpperBound, j += 2
IntRes1[i] := IntRes1[i] OR (BoolRes[i][j] AND BoolRes[i][j+1])
ENDFOR
ENDFOR
2: // equal each
IntRes1 := 0
FOR i := 0 to UpperBound
IntRes1[i] := BoolRes[i][i]
ENDFOR
3: // equal ordered
IntRes1 := (imm8[0] ? 0xFF : 0xFFFF)
FOR i := 0 to UpperBound
k := i
FOR j := 0 to UpperBound-i
IntRes1[i] := IntRes1[i] AND BoolRes[k][j]
k++
ENDFOR
ENDFOR
ESAC
// optionally negate results
FOR i := 0 to UpperBound
IF imm8[4]
IF imm8[5] // only negate valid
IF i >= lb // invalid, don't negate
IntRes2[i] := IntRes1[i]
ELSE // valid, negate
IntRes2[i] := -1 XOR IntRes1[i]
FI
ELSE // negate all
IntRes2[i] := -1 XOR IntRes1[i]
FI
ELSE // don't negate
IntRes2[i] := IntRes1[i]
FI
ENDFOR
// output
dst := (IntRes2 == 0) AND (lb > UpperBound)
Performance
pcmpestri
int _mm_cmpestrc (__m128i a, int la, __m128i b, int lb, const int imm8)
Synopsis
int _mm_cmpestrc (__m128i a, int la, __m128i b, int lb, const int imm8)
#include "nmmintrin.h"
Instruction: pcmpestri xmm, xmm, imm
CPUID Flags: SSE4.2
Description
Compare packed strings in
a and
b with lengths
la and
lb using the control in
imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise.
imm can be a combination of:
_SIDD_UBYTE_OPS // unsigned 8-bit characters
_SIDD_UWORD_OPS // unsigned 16-bit characters
_SIDD_SBYTE_OPS // signed 8-bit characters
_SIDD_SWORD_OPS // signed 16-bit characters
_SIDD_CMP_EQUAL_ANY // compare equal any
_SIDD_CMP_RANGES // compare ranges
_SIDD_CMP_EQUAL_EACH // compare equal each
_SIDD_CMP_EQUAL_ORDERED // compare equal ordered
_SIDD_NEGATIVE_POLARITY // negate results
_SIDD_MASKED_NEGATIVE_POLARITY // negate results only before end of string
_SIDD_LEAST_SIGNIFICANT // index only: return least significant bit
_SIDD_MOST_SIGNIFICANT // index only: return most significant bit
_SIDD_BIT_MASK // mask only: return bit mask
_SIDD_UNIT_MASK // mask only: return byte/word mask
Operation
size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
UpperBound := (128 / size) - 1
// compare all characters
aInvalid := 0
bInvalid := 0
FOR i := 0 to UpperBound
m := i*size
FOR j := 0 to UpperBound
n := j*size
BoolRes[i][j] := (a[m+size-1:m] == b[n+size-1:n])
// invalidate characters after EOS
IF i == la
aInvalid := 1
FI
IF j == lb
bInvalid := 1
FI
// override comparisons for invalid characters
CASE (imm8[3:2]) OF
0: // equal any
IF (!aInvalid && bInvalid)
BoolRes[i][j] := 0
ELSE IF (aInvalid && !bInvalid)
BoolRes[i][j] := 0
ELSE If (aInvalid && bInvalid)
BoolRes[i][j] := 0
FI
1: // ranges
IF (!aInvalid && bInvalid)
BoolRes[i][j] := 0
ELSE IF (aInvalid && !bInvalid)
BoolRes[i][j] := 0
ELSE If (aInvalid && bInvalid)
BoolRes[i][j] := 0
FI
2: // equal each
IF (!aInvalid && bInvalid)
BoolRes[i][j] := 0
ELSE IF (aInvalid && !bInvalid)
BoolRes[i][j] := 0
ELSE If (aInvalid && bInvalid)
BoolRes[i][j] := 1
FI
3: // equal ordered
IF (!aInvalid && bInvalid)
BoolRes[i][j] := 0
ELSE IF (aInvalid && !bInvalid)
BoolRes[i][j] := 1
ELSE If (aInvalid && bInvalid)
BoolRes[i][j] := 1
FI
ESAC
ENDFOR
ENDFOR
// aggregate results
CASE (imm8[3:2]) OF
0: // equal any
IntRes1 := 0
FOR i := 0 to UpperBound
FOR j := 0 to UpperBound
IntRes1[i] := IntRes1[i] OR BoolRes[i][j]
ENDFOR
ENDFOR
1: // ranges
IntRes1 := 0
FOR i := 0 to UpperBound
FOR j := 0 to UpperBound, j += 2
IntRes1[i] := IntRes1[i] OR (BoolRes[i][j] AND BoolRes[i][j+1])
ENDFOR
ENDFOR
2: // equal each
IntRes1 := 0
FOR i := 0 to UpperBound
IntRes1[i] := BoolRes[i][i]
ENDFOR
3: // equal ordered
IntRes1 := (imm8[0] ? 0xFF : 0xFFFF)
FOR i := 0 to UpperBound
k := i
FOR j := 0 to UpperBound-i
IntRes1[i] := IntRes1[i] AND BoolRes[k][j]
k++
ENDFOR
ENDFOR
ESAC
// optionally negate results
FOR i := 0 to UpperBound
IF imm8[4]
IF imm8[5] // only negate valid
IF i >= lb // invalid, don't negate
IntRes2[i] := IntRes1[i]
ELSE // valid, negate
IntRes2[i] := -1 XOR IntRes1[i]
FI
ELSE // negate all
IntRes2[i] := -1 XOR IntRes1[i]
FI
ELSE // don't negate
IntRes2[i] := IntRes1[i]
FI
ENDFOR
// output
dst := (IntRes2 != 0)
Performance
pcmpestri
int _mm_cmpestri (__m128i a, int la, __m128i b, int lb, const int imm8)
Synopsis
int _mm_cmpestri (__m128i a, int la, __m128i b, int lb, const int imm8)
#include "nmmintrin.h"
Instruction: pcmpestri xmm, xmm, imm
CPUID Flags: SSE4.2
Description
Compare packed strings in
a and
b with lengths
la and
lb using the control in
imm8, and store the generated index in
dst.
imm can be a combination of:
_SIDD_UBYTE_OPS // unsigned 8-bit characters
_SIDD_UWORD_OPS // unsigned 16-bit characters
_SIDD_SBYTE_OPS // signed 8-bit characters
_SIDD_SWORD_OPS // signed 16-bit characters
_SIDD_CMP_EQUAL_ANY // compare equal any
_SIDD_CMP_RANGES // compare ranges
_SIDD_CMP_EQUAL_EACH // compare equal each
_SIDD_CMP_EQUAL_ORDERED // compare equal ordered
_SIDD_NEGATIVE_POLARITY // negate results
_SIDD_MASKED_NEGATIVE_POLARITY // negate results only before end of string
_SIDD_LEAST_SIGNIFICANT // index only: return least significant bit
_SIDD_MOST_SIGNIFICANT // index only: return most significant bit
_SIDD_BIT_MASK // mask only: return bit mask
_SIDD_UNIT_MASK // mask only: return byte/word mask
Operation
size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
UpperBound := (128 / size) - 1
// compare all characters
aInvalid := 0
bInvalid := 0
FOR i := 0 to UpperBound
m := i*size
FOR j := 0 to UpperBound
n := j*size
BoolRes[i][j] := (a[m+size-1:m] == b[n+size-1:n])
// invalidate characters after EOS
IF i == la
aInvalid := 1
FI
IF j == lb
bInvalid := 1
FI
// override comparisons for invalid characters
CASE (imm8[3:2]) OF
0: // equal any
IF (!aInvalid && bInvalid)
BoolRes[i][j] := 0
ELSE IF (aInvalid && !bInvalid)
BoolRes[i][j] := 0
ELSE If (aInvalid && bInvalid)
BoolRes[i][j] := 0
FI
1: // ranges
IF (!aInvalid && bInvalid)
BoolRes[i][j] := 0
ELSE IF (aInvalid && !bInvalid)
BoolRes[i][j] := 0
ELSE If (aInvalid && bInvalid)
BoolRes[i][j] := 0
FI
2: // equal each
IF (!aInvalid && bInvalid)
BoolRes[i][j] := 0
ELSE IF (aInvalid && !bInvalid)
BoolRes[i][j] := 0
ELSE If (aInvalid && bInvalid)
BoolRes[i][j] := 1
FI
3: // equal ordered
IF (!aInvalid && bInvalid)
BoolRes[i][j] := 0
ELSE IF (aInvalid && !bInvalid)
BoolRes[i][j] := 1
ELSE If (aInvalid && bInvalid)
BoolRes[i][j] := 1
FI
ESAC
ENDFOR
ENDFOR
// aggregate results
CASE (imm8[3:2]) OF
0: // equal any
IntRes1 := 0
FOR i := 0 to UpperBound
FOR j := 0 to UpperBound
IntRes1[i] := IntRes1[i] OR BoolRes[i][j]
ENDFOR
ENDFOR
1: // ranges
IntRes1 := 0
FOR i := 0 to UpperBound
FOR j := 0 to UpperBound, j += 2
IntRes1[i] := IntRes1[i] OR (BoolRes[i][j] AND BoolRes[i][j+1])
ENDFOR
ENDFOR
2: // equal each
IntRes1 := 0
FOR i := 0 to UpperBound
IntRes1[i] := BoolRes[i][i]
ENDFOR
3: // equal ordered
IntRes1 := (imm8[0] ? 0xFF : 0xFFFF)
FOR i := 0 to UpperBound
k := i
FOR j := 0 to UpperBound-i
IntRes1[i] := IntRes1[i] AND BoolRes[k][j]
k++
ENDFOR
ENDFOR
ESAC
// optionally negate results
FOR i := 0 to UpperBound
IF imm8[4]
IF imm8[5] // only negate valid
IF i >= lb // invalid, don't negate
IntRes2[i] := IntRes1[i]
ELSE // valid, negate
IntRes2[i] := -1 XOR IntRes1[i]
FI
ELSE // negate all
IntRes2[i] := -1 XOR IntRes1[i]
FI
ELSE // don't negate
IntRes2[i] := IntRes1[i]
FI
ENDFOR
// output
IF imm8[6] // most significant bit
tmp := UpperBound
dst := tmp
DO WHILE ((tmp >= 0) AND IntRes2[tmp] == 0)
tmp := tmp - 1
dst := tmp
OD
ELSE // least significant bit
tmp := 0
dst := tmp
DO WHILE ((tmp <= UpperBound) AND IntRes2[tmp] == 0)
tmp := tmp + 1
dst := tmp
OD
FI
Performance
pcmpestrm
__m128i _mm_cmpestrm (__m128i a, int la, __m128i b, int lb, const int imm8)
Synopsis
__m128i _mm_cmpestrm (__m128i a, int la, __m128i b, int lb, const int imm8)
#include "nmmintrin.h"
Instruction: pcmpestrm xmm, xmm, imm
CPUID Flags: SSE4.2
Description
Compare packed strings in
a and
b with lengths
la and
lb using the control in
imm8, and store the generated mask in
dst.
imm can be a combination of:
_SIDD_UBYTE_OPS // unsigned 8-bit characters
_SIDD_UWORD_OPS // unsigned 16-bit characters
_SIDD_SBYTE_OPS // signed 8-bit characters
_SIDD_SWORD_OPS // signed 16-bit characters
_SIDD_CMP_EQUAL_ANY // compare equal any
_SIDD_CMP_RANGES // compare ranges
_SIDD_CMP_EQUAL_EACH // compare equal each
_SIDD_CMP_EQUAL_ORDERED // compare equal ordered
_SIDD_NEGATIVE_POLARITY // negate results
_SIDD_MASKED_NEGATIVE_POLARITY // negate results only before end of string
_SIDD_LEAST_SIGNIFICANT // index only: return least significant bit
_SIDD_MOST_SIGNIFICANT // index only: return most significant bit
_SIDD_BIT_MASK // mask only: return bit mask
_SIDD_UNIT_MASK // mask only: return byte/word mask
Operation
size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
UpperBound := (128 / size) - 1
// compare all characters
aInvalid := 0
bInvalid := 0
FOR i := 0 to UpperBound
m := i*size
FOR j := 0 to UpperBound
n := j*size
BoolRes[i][j] := (a[m+size-1:m] == b[n+size-1:n])
// invalidate characters after EOS
IF i == la
aInvalid := 1
FI
IF j == lb
bInvalid := 1
FI
// override comparisons for invalid characters
CASE (imm8[3:2]) OF
0: // equal any
IF (!aInvalid && bInvalid)
BoolRes[i][j] := 0
ELSE IF (aInvalid && !bInvalid)
BoolRes[i][j] := 0
ELSE If (aInvalid && bInvalid)
BoolRes[i][j] := 0
FI
1: // ranges
IF (!aInvalid && bInvalid)
BoolRes[i][j] := 0
ELSE IF (aInvalid && !bInvalid)
BoolRes[i][j] := 0
ELSE If (aInvalid && bInvalid)
BoolRes[i][j] := 0
FI
2: // equal each
IF (!aInvalid && bInvalid)
BoolRes[i][j] := 0
ELSE IF (aInvalid && !bInvalid)
BoolRes[i][j] := 0
ELSE If (aInvalid && bInvalid)
BoolRes[i][j] := 1
FI
3: // equal ordered
IF (!aInvalid && bInvalid)
BoolRes[i][j] := 0
ELSE IF (aInvalid && !bInvalid)
BoolRes[i][j] := 1
ELSE If (aInvalid && bInvalid)
BoolRes[i][j] := 1
FI
ESAC
ENDFOR
ENDFOR
// aggregate results
CASE (imm8[3:2]) OF
0: // equal any
IntRes1 := 0
FOR i := 0 to UpperBound
FOR j := 0 to UpperBound
IntRes1[i] := IntRes1[i] OR BoolRes[i][j]
ENDFOR
ENDFOR
1: // ranges
IntRes1 := 0
FOR i := 0 to UpperBound
FOR j := 0 to UpperBound, j += 2
IntRes1[i] := IntRes1[i] OR (BoolRes[i][j] AND BoolRes[i][j+1])
ENDFOR
ENDFOR
2: // equal each
IntRes1 := 0
FOR i := 0 to UpperBound
IntRes1[i] := BoolRes[i][i]
ENDFOR
3: // equal ordered
IntRes1 := (imm8[0] ? 0xFF : 0xFFFF)
FOR i := 0 to UpperBound
k := i
FOR j := 0 to UpperBound-i
IntRes1[i] := IntRes1[i] AND BoolRes[k][j]
k++
ENDFOR
ENDFOR
ESAC
// optionally negate results
FOR i := 0 to UpperBound
IF imm8[4]
IF imm8[5] // only negate valid
IF i >= lb // invalid, don't negate
IntRes2[i] := IntRes1[i]
ELSE // valid, negate
IntRes2[i] := -1 XOR IntRes1[i]
FI
ELSE // negate all
IntRes2[i] := -1 XOR IntRes1[i]
FI
ELSE // don't negate
IntRes2[i] := IntRes1[i]
FI
ENDFOR
// output
IF imm8[6] // byte / word mask
FOR i := 0 to UpperBound
j := i*size
IF IntRes2[i]
dst[j+size-1:j] := (imm8[0] ? 0xFF : 0xFFFF)
ELSE
dst[j+size-1:j] := 0
FI
ENDFOR
ELSE // bit mask
dst[UpperBound:0] := IntRes2[UpperBound:0]
dst[127:UpperBound+1] := 0
FI
Performance
pcmpestri
int _mm_cmpestro (__m128i a, int la, __m128i b, int lb, const int imm8)
Synopsis
int _mm_cmpestro (__m128i a, int la, __m128i b, int lb, const int imm8)
#include "nmmintrin.h"
Instruction: pcmpestri xmm, xmm, imm
CPUID Flags: SSE4.2
Description
Compare packed strings in
a and
b with lengths
la and
lb using the control in
imm8, and returns bit 0 of the resulting bit mask.
imm can be a combination of:
_SIDD_UBYTE_OPS // unsigned 8-bit characters
_SIDD_UWORD_OPS // unsigned 16-bit characters
_SIDD_SBYTE_OPS // signed 8-bit characters
_SIDD_SWORD_OPS // signed 16-bit characters
_SIDD_CMP_EQUAL_ANY // compare equal any
_SIDD_CMP_RANGES // compare ranges
_SIDD_CMP_EQUAL_EACH // compare equal each
_SIDD_CMP_EQUAL_ORDERED // compare equal ordered
_SIDD_NEGATIVE_POLARITY // negate results
_SIDD_MASKED_NEGATIVE_POLARITY // negate results only before end of string
_SIDD_LEAST_SIGNIFICANT // index only: return least significant bit
_SIDD_MOST_SIGNIFICANT // index only: return most significant bit
_SIDD_BIT_MASK // mask only: return bit mask
_SIDD_UNIT_MASK // mask only: return byte/word mask
Operation
size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
UpperBound := (128 / size) - 1
// compare all characters
aInvalid := 0
bInvalid := 0
FOR i := 0 to UpperBound
m := i*size
FOR j := 0 to UpperBound
n := j*size
BoolRes[i][j] := (a[m+size-1:m] == b[n+size-1:n])
// invalidate characters after EOS
IF i == la
aInvalid := 1
FI
IF j == lb
bInvalid := 1
FI
// override comparisons for invalid characters
CASE (imm8[3:2]) OF
0: // equal any
IF (!aInvalid && bInvalid)
BoolRes[i][j] := 0
ELSE IF (aInvalid && !bInvalid)
BoolRes[i][j] := 0
ELSE If (aInvalid && bInvalid)
BoolRes[i][j] := 0
FI
1: // ranges
IF (!aInvalid && bInvalid)
BoolRes[i][j] := 0
ELSE IF (aInvalid && !bInvalid)
BoolRes[i][j] := 0
ELSE If (aInvalid && bInvalid)
BoolRes[i][j] := 0
FI
2: // equal each
IF (!aInvalid && bInvalid)
BoolRes[i][j] := 0
ELSE IF (aInvalid && !bInvalid)
BoolRes[i][j] := 0
ELSE If (aInvalid && bInvalid)
BoolRes[i][j] := 1
FI
3: // equal ordered
IF (!aInvalid && bInvalid)
BoolRes[i][j] := 0
ELSE IF (aInvalid && !bInvalid)
BoolRes[i][j] := 1
ELSE If (aInvalid && bInvalid)
BoolRes[i][j] := 1
FI
ESAC
ENDFOR
ENDFOR
// aggregate results
CASE (imm8[3:2]) OF
0: // equal any
IntRes1 := 0
FOR i := 0 to UpperBound
FOR j := 0 to UpperBound
IntRes1[i] := IntRes1[i] OR BoolRes[i][j]
ENDFOR
ENDFOR
1: // ranges
IntRes1 := 0
FOR i := 0 to UpperBound
FOR j := 0 to UpperBound, j += 2
IntRes1[i] := IntRes1[i] OR (BoolRes[i][j] AND BoolRes[i][j+1])
ENDFOR
ENDFOR
2: // equal each
IntRes1 := 0
FOR i := 0 to UpperBound
IntRes1[i] := BoolRes[i][i]
ENDFOR
3: // equal ordered
IntRes1 := (imm8[0] ? 0xFF : 0xFFFF)
FOR i := 0 to UpperBound
k := i
FOR j := 0 to UpperBound-i
IntRes1[i] := IntRes1[i] AND BoolRes[k][j]
k++
ENDFOR
ENDFOR
ESAC
// optionally negate results
FOR i := 0 to UpperBound
IF imm8[4]
IF imm8[5] // only negate valid
IF i >= lb // invalid, don't negate
IntRes2[i] := IntRes1[i]
ELSE // valid, negate
IntRes2[i] := -1 XOR IntRes1[i]
FI
ELSE // negate all
IntRes2[i] := -1 XOR IntRes1[i]
FI
ELSE // don't negate
IntRes2[i] := IntRes1[i]
FI
ENDFOR
// output
dst := IntRes2[0]
Performance
pcmpestri
int _mm_cmpestrs (__m128i a, int la, __m128i b, int lb, const int imm8)
Synopsis
int _mm_cmpestrs (__m128i a, int la, __m128i b, int lb, const int imm8)
#include "nmmintrin.h"
Instruction: pcmpestri xmm, xmm, imm
CPUID Flags: SSE4.2
Description
Compare packed strings in
a and
b with lengths
la and
lb using the control in
imm8, and returns 1 if any character in
a was null, and 0 otherwise.
imm can be a combination of:
_SIDD_UBYTE_OPS // unsigned 8-bit characters
_SIDD_UWORD_OPS // unsigned 16-bit characters
_SIDD_SBYTE_OPS // signed 8-bit characters
_SIDD_SWORD_OPS // signed 16-bit characters
_SIDD_CMP_EQUAL_ANY // compare equal any
_SIDD_CMP_RANGES // compare ranges
_SIDD_CMP_EQUAL_EACH // compare equal each
_SIDD_CMP_EQUAL_ORDERED // compare equal ordered
_SIDD_NEGATIVE_POLARITY // negate results
_SIDD_MASKED_NEGATIVE_POLARITY // negate results only before end of string
_SIDD_LEAST_SIGNIFICANT // index only: return least significant bit
_SIDD_MOST_SIGNIFICANT // index only: return most significant bit
_SIDD_BIT_MASK // mask only: return bit mask
_SIDD_UNIT_MASK // mask only: return byte/word mask
Operation
size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
UpperBound := (128 / size) - 1
dst := (la <= UpperBound)
Performance
pcmpestri
int _mm_cmpestrz (__m128i a, int la, __m128i b, int lb, const int imm8)
Synopsis
int _mm_cmpestrz (__m128i a, int la, __m128i b, int lb, const int imm8)
#include "nmmintrin.h"
Instruction: pcmpestri xmm, xmm, imm
CPUID Flags: SSE4.2
Description
Compare packed strings in
a and
b with lengths
la and
lb using the control in
imm8, and returns 1 if any character in
b was null, and 0 otherwise.
imm can be a combination of:
_SIDD_UBYTE_OPS // unsigned 8-bit characters
_SIDD_UWORD_OPS // unsigned 16-bit characters
_SIDD_SBYTE_OPS // signed 8-bit characters
_SIDD_SWORD_OPS // signed 16-bit characters
_SIDD_CMP_EQUAL_ANY // compare equal any
_SIDD_CMP_RANGES // compare ranges
_SIDD_CMP_EQUAL_EACH // compare equal each
_SIDD_CMP_EQUAL_ORDERED // compare equal ordered
_SIDD_NEGATIVE_POLARITY // negate results
_SIDD_MASKED_NEGATIVE_POLARITY // negate results only before end of string
_SIDD_LEAST_SIGNIFICANT // index only: return least significant bit
_SIDD_MOST_SIGNIFICANT // index only: return most significant bit
_SIDD_BIT_MASK // mask only: return bit mask
_SIDD_UNIT_MASK // mask only: return byte/word mask
Operation
size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
UpperBound := (128 / size) - 1
dst := (lb <= UpperBound)
Performance
vpcmpw
__mmask8 _mm_cmpge_epi16_mask (__m128i a, __m128i b)
Synopsis
__mmask8 _mm_cmpge_epi16_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 7
i := j*16
k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vpcmpw
__mmask8 _mm_mask_cmpge_epi16_mask (__mmask8 k1, __m128i a, __m128i b)
Synopsis
__mmask8 _mm_mask_cmpge_epi16_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k1[j]
k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vpcmpw
__mmask16 _mm256_cmpge_epi16_mask (__m256i a, __m256i b)
Synopsis
__mmask16 _mm256_cmpge_epi16_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 15
i := j*16
k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vpcmpw
__mmask16 _mm256_mask_cmpge_epi16_mask (__mmask16 k1, __m256i a, __m256i b)
Synopsis
__mmask16 _mm256_mask_cmpge_epi16_mask (__mmask16 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k1[j]
k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vpcmpw
__mmask32 _mm512_cmpge_epi16_mask (__m512i a, __m512i b)
Synopsis
__mmask32 _mm512_cmpge_epi16_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512BW
Description
Compare packed 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 31
i := j*16
k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0
vpcmpw
__mmask32 _mm512_mask_cmpge_epi16_mask (__mmask32 k1, __m512i a, __m512i b)
Synopsis
__mmask32 _mm512_mask_cmpge_epi16_mask (__mmask32 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512BW
Description
Compare packed 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k1[j]
k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:32] := 0
vpcmpd
__mmask8 _mm_cmpge_epi32_mask (__m128i a, __m128i b)
Synopsis
__mmask8 _mm_cmpge_epi32_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 3
i := j*32
k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0
vpcmpd
__mmask8 _mm_mask_cmpge_epi32_mask (__mmask8 k1, __m128i a, __m128i b)
Synopsis
__mmask8 _mm_mask_cmpge_epi32_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k1[j]
k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:4] := 0
vpcmpd
__mmask8 _mm256_cmpge_epi32_mask (__m256i a, __m256i b)
Synopsis
__mmask8 _mm256_cmpge_epi32_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 7
i := j*32
k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vpcmpd
__mmask8 _mm256_mask_cmpge_epi32_mask (__mmask8 k1, __m256i a, __m256i b)
Synopsis
__mmask8 _mm256_mask_cmpge_epi32_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k1[j]
k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vpcmpd
__mmask16 _mm512_cmpge_epi32_mask (__m512i a, __m512i b)
Synopsis
__mmask16 _mm512_cmpge_epi32_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 15
i := j*32
k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vpcmpd
__mmask16 _mm512_mask_cmpge_epi32_mask (__mmask16 k1, __m512i a, __m512i b)
Synopsis
__mmask16 _mm512_mask_cmpge_epi32_mask (__mmask16 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k1[j]
k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vpcmpq
__mmask8 _mm_cmpge_epi64_mask (__m128i a, __m128i b)
Synopsis
__mmask8 _mm_cmpge_epi64_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 1
i := j*64
k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0
vpcmpq
__mmask8 _mm_mask_cmpge_epi64_mask (__mmask8 k1, __m128i a, __m128i b)
Synopsis
__mmask8 _mm_mask_cmpge_epi64_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k1[j]
k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:2] := 0
vpcmpq
__mmask8 _mm256_cmpge_epi64_mask (__m256i a, __m256i b)
Synopsis
__mmask8 _mm256_cmpge_epi64_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 3
i := j*64
k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0
vpcmpq
__mmask8 _mm256_mask_cmpge_epi64_mask (__mmask8 k1, __m256i a, __m256i b)
Synopsis
__mmask8 _mm256_mask_cmpge_epi64_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k1[j]
k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:4] := 0
vpcmpq
__mmask8 _mm512_cmpge_epi64_mask (__m512i a, __m512i b)
Synopsis
__mmask8 _mm512_cmpge_epi64_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpq k {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Compare packed 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 7
i := j*64
k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vpcmpq
__mmask8 _mm512_mask_cmpge_epi64_mask (__mmask8 k1, __m512i a, __m512i b)
Synopsis
__mmask8 _mm512_mask_cmpge_epi64_mask (__mmask8 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpq k {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Compare packed 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k1[j]
k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vpcmpb
__mmask16 _mm_cmpge_epi8_mask (__m128i a, __m128i b)
Synopsis
__mmask16 _mm_cmpge_epi8_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 15
i := j*8
k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vpcmpb
__mmask16 _mm_mask_cmpge_epi8_mask (__mmask16 k1, __m128i a, __m128i b)
Synopsis
__mmask16 _mm_mask_cmpge_epi8_mask (__mmask16 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k1[j]
k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vpcmpb
__mmask32 _mm256_cmpge_epi8_mask (__m256i a, __m256i b)
Synopsis
__mmask32 _mm256_cmpge_epi8_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 31
i := j*8
k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0
vpcmpb
__mmask32 _mm256_mask_cmpge_epi8_mask (__mmask32 k1, __m256i a, __m256i b)
Synopsis
__mmask32 _mm256_mask_cmpge_epi8_mask (__mmask32 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k1[j]
k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:32] := 0
vpcmpb
__mmask64 _mm512_cmpge_epi8_mask (__m512i a, __m512i b)
Synopsis
__mmask64 _mm512_cmpge_epi8_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512BW
Description
Compare packed 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 63
i := j*8
k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:64] := 0
vpcmpb
__mmask64 _mm512_mask_cmpge_epi8_mask (__mmask64 k1, __m512i a, __m512i b)
Synopsis
__mmask64 _mm512_mask_cmpge_epi8_mask (__mmask64 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512BW
Description
Compare packed 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k1[j]
k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:64] := 0
vpcmpuw
__mmask8 _mm_cmpge_epu16_mask (__m128i a, __m128i b)
Synopsis
__mmask8 _mm_cmpge_epu16_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 7
i := j*16
k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vpcmpuw
__mmask8 _mm_mask_cmpge_epu16_mask (__mmask8 k1, __m128i a, __m128i b)
Synopsis
__mmask8 _mm_mask_cmpge_epu16_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k1[j]
k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vpcmpuw
__mmask16 _mm256_cmpge_epu16_mask (__m256i a, __m256i b)
Synopsis
__mmask16 _mm256_cmpge_epu16_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 15
i := j*16
k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vpcmpuw
__mmask16 _mm256_mask_cmpge_epu16_mask (__mmask16 k1, __m256i a, __m256i b)
Synopsis
__mmask16 _mm256_mask_cmpge_epu16_mask (__mmask16 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k1[j]
k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vpcmpuw
__mmask32 _mm512_cmpge_epu16_mask (__m512i a, __m512i b)
Synopsis
__mmask32 _mm512_cmpge_epu16_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 31
i := j*16
k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0
vpcmpuw
__mmask32 _mm512_mask_cmpge_epu16_mask (__mmask32 k1, __m512i a, __m512i b)
Synopsis
__mmask32 _mm512_mask_cmpge_epu16_mask (__mmask32 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k1[j]
k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:32] := 0
vpcmpud
__mmask8 _mm_cmpge_epu32_mask (__m128i a, __m128i b)
Synopsis
__mmask8 _mm_cmpge_epu32_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 3
i := j*32
k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0
vpcmpud
__mmask8 _mm_mask_cmpge_epu32_mask (__mmask8 k1, __m128i a, __m128i b)
Synopsis
__mmask8 _mm_mask_cmpge_epu32_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k1[j]
k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:4] := 0
vpcmpud
__mmask8 _mm256_cmpge_epu32_mask (__m256i a, __m256i b)
Synopsis
__mmask8 _mm256_cmpge_epu32_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 7
i := j*32
k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vpcmpud
__mmask8 _mm256_mask_cmpge_epu32_mask (__mmask8 k1, __m256i a, __m256i b)
Synopsis
__mmask8 _mm256_mask_cmpge_epu32_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k1[j]
k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vpcmpud
__mmask16 _mm512_cmpge_epu32_mask (__m512i a, __m512i b)
Synopsis
__mmask16 _mm512_cmpge_epu32_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpud k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 15
i := j*32
k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vpcmpud
__mmask16 _mm512_mask_cmpge_epu32_mask (__mmask16 k1, __m512i a, __m512i b)
Synopsis
__mmask16 _mm512_mask_cmpge_epu32_mask (__mmask16 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpud k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed unsigned 32-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k1[j]
k[j] := ( a[i+31:i] >= b[i+31:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vpcmpuq
__mmask8 _mm_cmpge_epu64_mask (__m128i a, __m128i b)
Synopsis
__mmask8 _mm_cmpge_epu64_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 1
i := j*64
k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0
vpcmpuq
__mmask8 _mm_mask_cmpge_epu64_mask (__mmask8 k1, __m128i a, __m128i b)
Synopsis
__mmask8 _mm_mask_cmpge_epu64_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k1[j]
k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:2] := 0
vpcmpuq
__mmask8 _mm256_cmpge_epu64_mask (__m256i a, __m256i b)
Synopsis
__mmask8 _mm256_cmpge_epu64_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 3
i := j*64
k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0
vpcmpuq
__mmask8 _mm256_mask_cmpge_epu64_mask (__mmask8 k1, __m256i a, __m256i b)
Synopsis
__mmask8 _mm256_mask_cmpge_epu64_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k1[j]
k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:4] := 0
vpcmpuq
__mmask8 _mm512_cmpge_epu64_mask (__m512i a, __m512i b)
Synopsis
__mmask8 _mm512_cmpge_epu64_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpuq k {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 7
i := j*64
k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vpcmpuq
__mmask8 _mm512_mask_cmpge_epu64_mask (__mmask8 k1, __m512i a, __m512i b)
Synopsis
__mmask8 _mm512_mask_cmpge_epu64_mask (__mmask8 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpuq k {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Compare packed unsigned 64-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k1[j]
k[j] := ( a[i+63:i] >= b[i+63:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vpcmpub
__mmask16 _mm_cmpge_epu8_mask (__m128i a, __m128i b)
Synopsis
__mmask16 _mm_cmpge_epu8_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 15
i := j*8
k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vpcmpub
__mmask16 _mm_mask_cmpge_epu8_mask (__mmask16 k1, __m128i a, __m128i b)
Synopsis
__mmask16 _mm_mask_cmpge_epu8_mask (__mmask16 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k1[j]
k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vpcmpub
__mmask32 _mm256_cmpge_epu8_mask (__m256i a, __m256i b)
Synopsis
__mmask32 _mm256_cmpge_epu8_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 31
i := j*8
k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0
vpcmpub
__mmask32 _mm256_mask_cmpge_epu8_mask (__mmask32 k1, __m256i a, __m256i b)
Synopsis
__mmask32 _mm256_mask_cmpge_epu8_mask (__mmask32 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k1[j]
k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:32] := 0
vpcmpub
__mmask64 _mm512_cmpge_epu8_mask (__m512i a, __m512i b)
Synopsis
__mmask64 _mm512_cmpge_epu8_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 63
i := j*8
k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:64] := 0
vpcmpub
__mmask64 _mm512_mask_cmpge_epu8_mask (__mmask64 k1, __m512i a, __m512i b)
Synopsis
__mmask64 _mm512_mask_cmpge_epu8_mask (__mmask64 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b for greater-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k1[j]
k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:64] := 0
cmppd
__m128d _mm_cmpge_pd (__m128d a, __m128d b)
Synopsis
__m128d _mm_cmpge_pd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: cmppd xmm, xmm, imm
CPUID Flags: SSE2
Description
Compare packed double-precision (64-bit) floating-point elements in a and b for greater-than-or-equal, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := (a[i+63:i] >= b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0
ENDFOR
Performance
cmpps
__m128 _mm_cmpge_ps (__m128 a, __m128 b)
Synopsis
__m128 _mm_cmpge_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: cmpps xmm, xmm, imm
CPUID Flags: SSE
Description
Compare packed single-precision (32-bit) floating-point elements in a and b for greater-than-or-equal, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := ( a[i+31:i] >= b[i+31:i] ) ? 0xffffffff : 0
ENDFOR
Performance
cmpsd
__m128d _mm_cmpge_sd (__m128d a, __m128d b)
Synopsis
__m128d _mm_cmpge_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: cmpsd xmm, xmm, xmm, imm
CPUID Flags: SSE2
Description
Compare the lower double-precision (64-bit) floating-point elements in a and b for greater-than-or-equal, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
Operation
dst[63:0] := (a[63:0] >= b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0
dst[127:64] := a[127:64]
Performance
cmpss
__m128 _mm_cmpge_ss (__m128 a, __m128 b)
Synopsis
__m128 _mm_cmpge_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: cmpss xmm, xmm, imm
CPUID Flags: SSE
Description
Compare the lower single-precision (32-bit) floating-point elements in a and b for greater-than-or-equal, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
dst[31:0] := ( a[31:0] >= b[31:0] ) ? 0xffffffff : 0
dst[127:32] := a[127:32]
Performance
pcmpgtw
__m128i _mm_cmpgt_epi16 (__m128i a, __m128i b)
Synopsis
__m128i _mm_cmpgt_epi16 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: pcmpgtw xmm, xmm
CPUID Flags: SSE2
Description
Compare packed 16-bit integers in a and b for greater-than, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*16
dst[i+15:i] := ( a[i+15:i] > b[i+15:i] ) ? 0xFFFF : 0
ENDFOR
Performance
vpcmpgtw
__m256i _mm256_cmpgt_epi16 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_cmpgt_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpgtw ymm, ymm, ymm
CPUID Flags: AVX2
Description
Compare packed 16-bit integers in a and b for greater-than, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*16
dst[i+15:i] := ( a[i+15:i] > b[i+15:i] ) ? 0xFFFF : 0
ENDFOR
dst[MAX:256] := 0
Performance
vpcmpw
__mmask8 _mm_cmpgt_epi16_mask (__m128i a, __m128i b)
Synopsis
__mmask8 _mm_cmpgt_epi16_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 16-bit integers in a and b for greater-than, and store the results in mask vector k.
Operation
FOR j := 0 to 7
i := j*16
k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vpcmpw
__mmask8 _mm_mask_cmpgt_epi16_mask (__mmask8 k1, __m128i a, __m128i b)
Synopsis
__mmask8 _mm_mask_cmpgt_epi16_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 16-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k1[j]
k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vpcmpw
__mmask16 _mm256_cmpgt_epi16_mask (__m256i a, __m256i b)
Synopsis
__mmask16 _mm256_cmpgt_epi16_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 16-bit integers in a and b for greater-than, and store the results in mask vector k.
Operation
FOR j := 0 to 15
i := j*16
k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vpcmpw
__mmask16 _mm256_mask_cmpgt_epi16_mask (__mmask16 k1, __m256i a, __m256i b)
Synopsis
__mmask16 _mm256_mask_cmpgt_epi16_mask (__mmask16 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 16-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k1[j]
k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vpcmpw
__mmask32 _mm512_cmpgt_epi16_mask (__m512i a, __m512i b)
Synopsis
__mmask32 _mm512_cmpgt_epi16_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512BW
Description
Compare packed 16-bit integers in a and b for greater-than, and store the results in mask vector k.
Operation
FOR j := 0 to 31
i := j*16
k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0
vpcmpw
__mmask32 _mm512_mask_cmpgt_epi16_mask (__mmask32 k1, __m512i a, __m512i b)
Synopsis
__mmask32 _mm512_mask_cmpgt_epi16_mask (__mmask32 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512BW
Description
Compare packed 16-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k1[j]
k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:32] := 0
pcmpgtd
__m128i _mm_cmpgt_epi32 (__m128i a, __m128i b)
Synopsis
__m128i _mm_cmpgt_epi32 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: pcmpgtd xmm, xmm
CPUID Flags: SSE2
Description
Compare packed 32-bit integers in a and b for greater-than, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := ( a[i+31:i] > b[i+31:i] ) ? 0xFFFFFFFF : 0
ENDFOR
Performance
vpcmpgtd
__m256i _mm256_cmpgt_epi32 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_cmpgt_epi32 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpgtd ymm, ymm, ymm
CPUID Flags: AVX2
Description
Compare packed 32-bit integers in a and b for greater-than, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := ( a[i+31:i] > b[i+31:i] ) ? 0xFFFFFFFF : 0
ENDFOR
dst[MAX:256] := 0
Performance
vpcmpd
__mmask8 _mm_cmpgt_epi32_mask (__m128i a, __m128i b)
Synopsis
__mmask8 _mm_cmpgt_epi32_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 32-bit integers in a and b for greater-than, and store the results in mask vector k.
Operation
FOR j := 0 to 3
i := j*32
k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0
vpcmpd
__mmask8 _mm_mask_cmpgt_epi32_mask (__mmask8 k1, __m128i a, __m128i b)
Synopsis
__mmask8 _mm_mask_cmpgt_epi32_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 32-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k1[j]
k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:4] := 0
vpcmpd
__mmask8 _mm256_cmpgt_epi32_mask (__m256i a, __m256i b)
Synopsis
__mmask8 _mm256_cmpgt_epi32_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 32-bit integers in a and b for greater-than, and store the results in mask vector k.
Operation
FOR j := 0 to 7
i := j*32
k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vpcmpd
__mmask8 _mm256_mask_cmpgt_epi32_mask (__mmask8 k1, __m256i a, __m256i b)
Synopsis
__mmask8 _mm256_mask_cmpgt_epi32_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 32-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k1[j]
k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vpcmpgtd
__mmask16 _mm512_cmpgt_epi32_mask (__m512i a, __m512i b)
Synopsis
__mmask16 _mm512_cmpgt_epi32_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpgtd k {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed 32-bit integers in a and b for greater-than, and store the results in mask vector k.
Operation
FOR j := 0 to 15
i := j*32
k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vpcmpgtd
__mmask16 _mm512_mask_cmpgt_epi32_mask (__mmask16 k1, __m512i a, __m512i b)
Synopsis
__mmask16 _mm512_mask_cmpgt_epi32_mask (__mmask16 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpgtd k {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed 32-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k1[j]
k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
pcmpgtq
__m128i _mm_cmpgt_epi64 (__m128i a, __m128i b)
Synopsis
__m128i _mm_cmpgt_epi64 (__m128i a, __m128i b)
#include "nmmintrin.h"
Instruction: pcmpgtq xmm, xmm
CPUID Flags: SSE4.2
Description
Compare packed 64-bit integers in a and b for greater-than, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := ( a[i+63:i] > b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0
ENDFOR
Performance
vpcmpgtq
__m256i _mm256_cmpgt_epi64 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_cmpgt_epi64 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpgtq ymm, ymm, ymm
CPUID Flags: AVX2
Description
Compare packed 64-bit integers in a and b for greater-than, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := ( a[i+63:i] > b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0
ENDFOR
dst[MAX:256] := 0
Performance
vpcmpq
__mmask8 _mm_cmpgt_epi64_mask (__m128i a, __m128i b)
Synopsis
__mmask8 _mm_cmpgt_epi64_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 64-bit integers in a and b for greater-than, and store the results in mask vector k.
Operation
FOR j := 0 to 1
i := j*64
k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0
vpcmpq
__mmask8 _mm_mask_cmpgt_epi64_mask (__mmask8 k1, __m128i a, __m128i b)
Synopsis
__mmask8 _mm_mask_cmpgt_epi64_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 64-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k1[j]
k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:2] := 0
vpcmpq
__mmask8 _mm256_cmpgt_epi64_mask (__m256i a, __m256i b)
Synopsis
__mmask8 _mm256_cmpgt_epi64_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 64-bit integers in a and b for greater-than, and store the results in mask vector k.
Operation
FOR j := 0 to 3
i := j*64
k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0
vpcmpq
__mmask8 _mm256_mask_cmpgt_epi64_mask (__mmask8 k1, __m256i a, __m256i b)
Synopsis
__mmask8 _mm256_mask_cmpgt_epi64_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 64-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k1[j]
k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:4] := 0
vpcmpgtq
__mmask8 _mm512_cmpgt_epi64_mask (__m512i a, __m512i b)
Synopsis
__mmask8 _mm512_cmpgt_epi64_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpgtq k {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Compare packed 64-bit integers in a and b for greater-than, and store the results in mask vector k.
Operation
FOR j := 0 to 7
i := j*64
k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vpcmpgtq
__mmask8 _mm512_mask_cmpgt_epi64_mask (__mmask8 k1, __m512i a, __m512i b)
Synopsis
__mmask8 _mm512_mask_cmpgt_epi64_mask (__mmask8 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpgtq k {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Compare packed 64-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k1[j]
k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
pcmpgtb
__m128i _mm_cmpgt_epi8 (__m128i a, __m128i b)
Synopsis
__m128i _mm_cmpgt_epi8 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: pcmpgtb xmm, xmm
CPUID Flags: SSE2
Description
Compare packed 8-bit integers in a and b for greater-than, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*8
dst[i+7:i] := ( a[i+7:i] > b[i+7:i] ) ? 0xFF : 0
ENDFOR
Performance
vpcmpgtb
__m256i _mm256_cmpgt_epi8 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_cmpgt_epi8 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpgtb ymm, ymm, ymm
CPUID Flags: AVX2
Description
Compare packed 8-bit integers in a and b for greater-than, and store the results in dst.
Operation
FOR j := 0 to 31
i := j*8
dst[i+7:i] := ( a[i+7:i] > b[i+7:i] ) ? 0xFF : 0
ENDFOR
dst[MAX:256] := 0
Performance
vpcmpb
__mmask16 _mm_cmpgt_epi8_mask (__m128i a, __m128i b)
Synopsis
__mmask16 _mm_cmpgt_epi8_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 8-bit integers in a and b for greater-than, and store the results in mask vector k.
Operation
FOR j := 0 to 15
i := j*8
k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vpcmpb
__mmask16 _mm_mask_cmpgt_epi8_mask (__mmask16 k1, __m128i a, __m128i b)
Synopsis
__mmask16 _mm_mask_cmpgt_epi8_mask (__mmask16 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 8-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k1[j]
k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vpcmpb
__mmask32 _mm256_cmpgt_epi8_mask (__m256i a, __m256i b)
Synopsis
__mmask32 _mm256_cmpgt_epi8_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 8-bit integers in a and b for greater-than, and store the results in mask vector k.
Operation
FOR j := 0 to 31
i := j*8
k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0
vpcmpb
__mmask32 _mm256_mask_cmpgt_epi8_mask (__mmask32 k1, __m256i a, __m256i b)
Synopsis
__mmask32 _mm256_mask_cmpgt_epi8_mask (__mmask32 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 8-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k1[j]
k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:32] := 0
vpcmpb
__mmask64 _mm512_cmpgt_epi8_mask (__m512i a, __m512i b)
Synopsis
__mmask64 _mm512_cmpgt_epi8_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512BW
Description
Compare packed 8-bit integers in a and b for greater-than, and store the results in mask vector k.
Operation
FOR j := 0 to 63
i := j*8
k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:64] := 0
vpcmpb
__mmask64 _mm512_mask_cmpgt_epi8_mask (__mmask64 k1, __m512i a, __m512i b)
Synopsis
__mmask64 _mm512_mask_cmpgt_epi8_mask (__mmask64 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512BW
Description
Compare packed 8-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k1[j]
k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:64] := 0
vpcmpuw
__mmask8 _mm_cmpgt_epu16_mask (__m128i a, __m128i b)
Synopsis
__mmask8 _mm_cmpgt_epu16_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b for greater-than, and store the results in mask vector k.
Operation
FOR j := 0 to 7
i := j*16
k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vpcmpuw
__mmask8 _mm_mask_cmpgt_epu16_mask (__mmask8 k1, __m128i a, __m128i b)
Synopsis
__mmask8 _mm_mask_cmpgt_epu16_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k1[j]
k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vpcmpuw
__mmask16 _mm256_cmpgt_epu16_mask (__m256i a, __m256i b)
Synopsis
__mmask16 _mm256_cmpgt_epu16_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b for greater-than, and store the results in mask vector k.
Operation
FOR j := 0 to 15
i := j*16
k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vpcmpuw
__mmask16 _mm256_mask_cmpgt_epu16_mask (__mmask16 k1, __m256i a, __m256i b)
Synopsis
__mmask16 _mm256_mask_cmpgt_epu16_mask (__mmask16 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k1[j]
k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vpcmpuw
__mmask32 _mm512_cmpgt_epu16_mask (__m512i a, __m512i b)
Synopsis
__mmask32 _mm512_cmpgt_epu16_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b for greater-than, and store the results in mask vector k.
Operation
FOR j := 0 to 31
i := j*16
k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0
vpcmpuw
__mmask32 _mm512_mask_cmpgt_epu16_mask (__mmask32 k1, __m512i a, __m512i b)
Synopsis
__mmask32 _mm512_mask_cmpgt_epu16_mask (__mmask32 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k1[j]
k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:32] := 0
vpcmpud
__mmask8 _mm_cmpgt_epu32_mask (__m128i a, __m128i b)
Synopsis
__mmask8 _mm_cmpgt_epu32_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in mask vector k.
Operation
FOR j := 0 to 3
i := j*32
k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0
vpcmpud
__mmask8 _mm_mask_cmpgt_epu32_mask (__mmask8 k1, __m128i a, __m128i b)
Synopsis
__mmask8 _mm_mask_cmpgt_epu32_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k1[j]
k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:4] := 0
vpcmpud
__mmask8 _mm256_cmpgt_epu32_mask (__m256i a, __m256i b)
Synopsis
__mmask8 _mm256_cmpgt_epu32_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in mask vector k.
Operation
FOR j := 0 to 7
i := j*32
k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vpcmpud
__mmask8 _mm256_mask_cmpgt_epu32_mask (__mmask8 k1, __m256i a, __m256i b)
Synopsis
__mmask8 _mm256_mask_cmpgt_epu32_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k1[j]
k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vpcmpud
__mmask16 _mm512_cmpgt_epu32_mask (__m512i a, __m512i b)
Synopsis
__mmask16 _mm512_cmpgt_epu32_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpud k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in mask vector k.
Operation
FOR j := 0 to 15
i := j*32
k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vpcmpud
__mmask16 _mm512_mask_cmpgt_epu32_mask (__mmask16 k1, __m512i a, __m512i b)
Synopsis
__mmask16 _mm512_mask_cmpgt_epu32_mask (__mmask16 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpud k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed unsigned 32-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k1[j]
k[j] := ( a[i+31:i] > b[i+31:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vpcmpuq
__mmask8 _mm_cmpgt_epu64_mask (__m128i a, __m128i b)
Synopsis
__mmask8 _mm_cmpgt_epu64_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in mask vector k.
Operation
FOR j := 0 to 1
i := j*64
k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0
vpcmpuq
__mmask8 _mm_mask_cmpgt_epu64_mask (__mmask8 k1, __m128i a, __m128i b)
Synopsis
__mmask8 _mm_mask_cmpgt_epu64_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k1[j]
k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:2] := 0
vpcmpuq
__mmask8 _mm256_cmpgt_epu64_mask (__m256i a, __m256i b)
Synopsis
__mmask8 _mm256_cmpgt_epu64_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in mask vector k.
Operation
FOR j := 0 to 3
i := j*64
k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0
vpcmpuq
__mmask8 _mm256_mask_cmpgt_epu64_mask (__mmask8 k1, __m256i a, __m256i b)
Synopsis
__mmask8 _mm256_mask_cmpgt_epu64_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k1[j]
k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:4] := 0
vpcmpuq
__mmask8 _mm512_cmpgt_epu64_mask (__m512i a, __m512i b)
Synopsis
__mmask8 _mm512_cmpgt_epu64_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpuq k {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in mask vector k.
Operation
FOR j := 0 to 7
i := j*64
k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vpcmpuq
__mmask8 _mm512_mask_cmpgt_epu64_mask (__mmask8 k1, __m512i a, __m512i b)
Synopsis
__mmask8 _mm512_mask_cmpgt_epu64_mask (__mmask8 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpuq k {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Compare packed unsigned 64-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k1[j]
k[j] := ( a[i+63:i] > b[i+63:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vpcmpub
__mmask16 _mm_cmpgt_epu8_mask (__m128i a, __m128i b)
Synopsis
__mmask16 _mm_cmpgt_epu8_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b for greater-than, and store the results in mask vector k.
Operation
FOR j := 0 to 15
i := j*8
k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vpcmpub
__mmask16 _mm_mask_cmpgt_epu8_mask (__mmask16 k1, __m128i a, __m128i b)
Synopsis
__mmask16 _mm_mask_cmpgt_epu8_mask (__mmask16 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k1[j]
k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vpcmpub
__mmask32 _mm256_cmpgt_epu8_mask (__m256i a, __m256i b)
Synopsis
__mmask32 _mm256_cmpgt_epu8_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b for greater-than, and store the results in mask vector k.
Operation
FOR j := 0 to 31
i := j*8
k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0
vpcmpub
__mmask32 _mm256_mask_cmpgt_epu8_mask (__mmask32 k1, __m256i a, __m256i b)
Synopsis
__mmask32 _mm256_mask_cmpgt_epu8_mask (__mmask32 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k1[j]
k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:32] := 0
vpcmpub
__mmask64 _mm512_cmpgt_epu8_mask (__m512i a, __m512i b)
Synopsis
__mmask64 _mm512_cmpgt_epu8_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b for greater-than, and store the results in mask vector k.
Operation
FOR j := 0 to 63
i := j*8
k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:64] := 0
vpcmpub
__mmask64 _mm512_mask_cmpgt_epu8_mask (__mmask64 k1, __m512i a, __m512i b)
Synopsis
__mmask64 _mm512_mask_cmpgt_epu8_mask (__mmask64 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b for greater-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k1[j]
k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:64] := 0
cmppd
__m128d _mm_cmpgt_pd (__m128d a, __m128d b)
Synopsis
__m128d _mm_cmpgt_pd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: cmppd xmm, xmm, imm
CPUID Flags: SSE2
Description
Compare packed double-precision (64-bit) floating-point elements in a and b for greater-than, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := (a[i+63:i] > b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0
ENDFOR
Performance
cmpps
__m128 _mm_cmpgt_ps (__m128 a, __m128 b)
Synopsis
__m128 _mm_cmpgt_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: cmpps xmm, xmm, imm
CPUID Flags: SSE
Description
Compare packed single-precision (32-bit) floating-point elements in a and b for greater-than, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := ( a[i+31:i] > b[i+31:i] ) ? 0xffffffff : 0
ENDFOR
Performance
cmpsd
__m128d _mm_cmpgt_sd (__m128d a, __m128d b)
Synopsis
__m128d _mm_cmpgt_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: cmpsd xmm, xmm, xmm, imm
CPUID Flags: SSE2
Description
Compare the lower double-precision (64-bit) floating-point elements in a and b for greater-than, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
Operation
dst[63:0] := (a[63:0] > b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0
dst[127:64] := a[127:64]
Performance
cmpss
__m128 _mm_cmpgt_ss (__m128 a, __m128 b)
Synopsis
__m128 _mm_cmpgt_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: cmpss xmm, xmm, imm
CPUID Flags: SSE
Description
Compare the lower single-precision (32-bit) floating-point elements in a and b for greater-than, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
dst[31:0] := ( a[31:0] > b[31:0] ) ? 0xffffffff : 0
dst[127:32] := a[127:32]
Performance
pcmpistri
int _mm_cmpistra (__m128i a, __m128i b, const int imm8)
Synopsis
int _mm_cmpistra (__m128i a, __m128i b, const int imm8)
#include "nmmintrin.h"
Instruction: pcmpistri xmm, xmm, imm
CPUID Flags: SSE4.2
Description
Compare packed strings with implicit lengths in
a and
b using the control in
imm8, and returns 1 if
b did not contain a null character and the resulting mask was zero, and 0 otherwise.
imm can be a combination of:
_SIDD_UBYTE_OPS // unsigned 8-bit characters
_SIDD_UWORD_OPS // unsigned 16-bit characters
_SIDD_SBYTE_OPS // signed 8-bit characters
_SIDD_SWORD_OPS // signed 16-bit characters
_SIDD_CMP_EQUAL_ANY // compare equal any
_SIDD_CMP_RANGES // compare ranges
_SIDD_CMP_EQUAL_EACH // compare equal each
_SIDD_CMP_EQUAL_ORDERED // compare equal ordered
_SIDD_NEGATIVE_POLARITY // negate results
_SIDD_MASKED_NEGATIVE_POLARITY // negate results only before end of string
_SIDD_LEAST_SIGNIFICANT // index only: return least significant bit
_SIDD_MOST_SIGNIFICANT // index only: return most significant bit
_SIDD_BIT_MASK // mask only: return bit mask
_SIDD_UNIT_MASK // mask only: return byte/word mask
Operation
size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
UpperBound := (128 / size) - 1
// compare all characters
aInvalid := 0
bInvalid := 0
FOR i := 0 to UpperBound
m := i*size
FOR j := 0 to UpperBound
n := j*size
BoolRes[i][j] := (a[m+size-1:m] == b[n+size-1:n])
// invalidate characters after EOS
IF a[m+size-1:m] == 0
aInvalid := 1
FI
IF b[n+size-1:n] == 0
bInvalid := 1
FI
// override comparisons for invalid characters
CASE (imm8[3:2]) OF
0: // equal any
IF (!aInvalid && bInvalid)
BoolRes[i][j] := 0
ELSE IF (aInvalid && !bInvalid)
BoolRes[i][j] := 0
ELSE If (aInvalid && bInvalid)
BoolRes[i][j] := 0
FI
1: // ranges
IF (!aInvalid && bInvalid)
BoolRes[i][j] := 0
ELSE IF (aInvalid && !bInvalid)
BoolRes[i][j] := 0
ELSE If (aInvalid && bInvalid)
BoolRes[i][j] := 0
FI
2: // equal each
IF (!aInvalid && bInvalid)
BoolRes[i][j] := 0
ELSE IF (aInvalid && !bInvalid)
BoolRes[i][j] := 0
ELSE If (aInvalid && bInvalid)
BoolRes[i][j] := 1
FI
3: // equal ordered
IF (!aInvalid && bInvalid)
BoolRes[i][j] := 0
ELSE IF (aInvalid && !bInvalid)
BoolRes[i][j] := 1
ELSE If (aInvalid && bInvalid)
BoolRes[i][j] := 1
FI
ESAC
ENDFOR
ENDFOR
// aggregate results
CASE (imm8[3:2]) OF
0: // equal any
IntRes1 := 0
FOR i := 0 to UpperBound
FOR j := 0 to UpperBound
IntRes1[i] := IntRes1[i] OR BoolRes[i][j]
ENDFOR
ENDFOR
1: // ranges
IntRes1 := 0
FOR i := 0 to UpperBound
FOR j := 0 to UpperBound, j += 2
IntRes1[i] := IntRes1[i] OR (BoolRes[i][j] AND BoolRes[i][j+1])
ENDFOR
ENDFOR
2: // equal each
IntRes1 := 0
FOR i := 0 to UpperBound
IntRes1[i] := BoolRes[i][i]
ENDFOR
3: // equal ordered
IntRes1 := (imm8[0] ? 0xFF : 0xFFFF)
FOR i := 0 to UpperBound
k := i
FOR j := 0 to UpperBound-i
IntRes1[i] := IntRes1[i] AND BoolRes[k][j]
k++
ENDFOR
ENDFOR
ESAC
// optionally negate results
bInvalid := 0
FOR i := 0 to UpperBound
IF imm8[4]
IF imm8[5] // only negate valid
IF b[i*size+size-1:i*size] == 0
bInvalid := 1
FI
IF bInvalid // invalid, don't negate
IntRes2[i] := IntRes1[i]
ELSE // valid, negate
IntRes2[i] := -1 XOR IntRes1[i]
FI
ELSE // negate all
IntRes2[i] := -1 XOR IntRes1[i]
FI
ELSE // don't negate
IntRes2[i] := IntRes1[i]
FI
ENDFOR
// output
dst := (IntRes2 == 0) AND (bInvalid == 0)
Performance
pcmpistri
int _mm_cmpistrc (__m128i a, __m128i b, const int imm8)
Synopsis
int _mm_cmpistrc (__m128i a, __m128i b, const int imm8)
#include "nmmintrin.h"
Instruction: pcmpistri xmm, xmm, imm
CPUID Flags: SSE4.2
Description
Compare packed strings with implicit lengths in
a and
b using the control in
imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise.
imm can be a combination of:
_SIDD_UBYTE_OPS // unsigned 8-bit characters
_SIDD_UWORD_OPS // unsigned 16-bit characters
_SIDD_SBYTE_OPS // signed 8-bit characters
_SIDD_SWORD_OPS // signed 16-bit characters
_SIDD_CMP_EQUAL_ANY // compare equal any
_SIDD_CMP_RANGES // compare ranges
_SIDD_CMP_EQUAL_EACH // compare equal each
_SIDD_CMP_EQUAL_ORDERED // compare equal ordered
_SIDD_NEGATIVE_POLARITY // negate results
_SIDD_MASKED_NEGATIVE_POLARITY // negate results only before end of string
_SIDD_LEAST_SIGNIFICANT // index only: return least significant bit
_SIDD_MOST_SIGNIFICANT // index only: return most significant bit
_SIDD_BIT_MASK // mask only: return bit mask
_SIDD_UNIT_MASK // mask only: return byte/word mask
Operation
size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
UpperBound := (128 / size) - 1
// compare all characters
aInvalid := 0
bInvalid := 0
FOR i := 0 to UpperBound
m := i*size
FOR j := 0 to UpperBound
n := j*size
BoolRes[i][j] := (a[m+size-1:m] == b[n+size-1:n])
// invalidate characters after EOS
IF a[m+size-1:m] == 0
aInvalid := 1
FI
IF b[n+size-1:n] == 0
bInvalid := 1
FI
// override comparisons for invalid characters
CASE (imm8[3:2]) OF
0: // equal any
IF (!aInvalid && bInvalid)
BoolRes[i][j] := 0
ELSE IF (aInvalid && !bInvalid)
BoolRes[i][j] := 0
ELSE If (aInvalid && bInvalid)
BoolRes[i][j] := 0
FI
1: // ranges
IF (!aInvalid && bInvalid)
BoolRes[i][j] := 0
ELSE IF (aInvalid && !bInvalid)
BoolRes[i][j] := 0
ELSE If (aInvalid && bInvalid)
BoolRes[i][j] := 0
FI
2: // equal each
IF (!aInvalid && bInvalid)
BoolRes[i][j] := 0
ELSE IF (aInvalid && !bInvalid)
BoolRes[i][j] := 0
ELSE If (aInvalid && bInvalid)
BoolRes[i][j] := 1
FI
3: // equal ordered
IF (!aInvalid && bInvalid)
BoolRes[i][j] := 0
ELSE IF (aInvalid && !bInvalid)
BoolRes[i][j] := 1
ELSE If (aInvalid && bInvalid)
BoolRes[i][j] := 1
FI
ESAC
ENDFOR
ENDFOR
// aggregate results
CASE (imm8[3:2]) OF
0: // equal any
IntRes1 := 0
FOR i := 0 to UpperBound
FOR j := 0 to UpperBound
IntRes1[i] := IntRes1[i] OR BoolRes[i][j]
ENDFOR
ENDFOR
1: // ranges
IntRes1 := 0
FOR i := 0 to UpperBound
FOR j := 0 to UpperBound, j += 2
IntRes1[i] := IntRes1[i] OR (BoolRes[i][j] AND BoolRes[i][j+1])
ENDFOR
ENDFOR
2: // equal each
IntRes1 := 0
FOR i := 0 to UpperBound
IntRes1[i] := BoolRes[i][i]
ENDFOR
3: // equal ordered
IntRes1 := (imm8[0] ? 0xFF : 0xFFFF)
FOR i := 0 to UpperBound
k := i
FOR j := 0 to UpperBound-i
IntRes1[i] := IntRes1[i] AND BoolRes[k][j]
k++
ENDFOR
ENDFOR
ESAC
// optionally negate results
bInvalid := 0
FOR i := 0 to UpperBound
IF imm8[4]
IF imm8[5] // only negate valid
IF b[i*size+size-1:i*size] == 0
bInvalid := 1
FI
IF bInvalid // invalid, don't negate
IntRes2[i] := IntRes1[i]
ELSE // valid, negate
IntRes2[i] := -1 XOR IntRes1[i]
FI
ELSE // negate all
IntRes2[i] := -1 XOR IntRes1[i]
FI
ELSE // don't negate
IntRes2[i] := IntRes1[i]
FI
ENDFOR
// output
dst := (IntRes2 != 0)
Performance
pcmpistri
int _mm_cmpistri (__m128i a, __m128i b, const int imm8)
Synopsis
int _mm_cmpistri (__m128i a, __m128i b, const int imm8)
#include "nmmintrin.h"
Instruction: pcmpistri xmm, xmm, imm
CPUID Flags: SSE4.2
Description
Compare packed strings with implicit lengths in
a and
b using the control in
imm8, and store the generated index in
dst.
imm can be a combination of:
_SIDD_UBYTE_OPS // unsigned 8-bit characters
_SIDD_UWORD_OPS // unsigned 16-bit characters
_SIDD_SBYTE_OPS // signed 8-bit characters
_SIDD_SWORD_OPS // signed 16-bit characters
_SIDD_CMP_EQUAL_ANY // compare equal any
_SIDD_CMP_RANGES // compare ranges
_SIDD_CMP_EQUAL_EACH // compare equal each
_SIDD_CMP_EQUAL_ORDERED // compare equal ordered
_SIDD_NEGATIVE_POLARITY // negate results
_SIDD_MASKED_NEGATIVE_POLARITY // negate results only before end of string
_SIDD_LEAST_SIGNIFICANT // index only: return least significant bit
_SIDD_MOST_SIGNIFICANT // index only: return most significant bit
_SIDD_BIT_MASK // mask only: return bit mask
_SIDD_UNIT_MASK // mask only: return byte/word mask
Operation
size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
UpperBound := (128 / size) - 1
// compare all characters
aInvalid := 0
bInvalid := 0
FOR i := 0 to UpperBound
m := i*size
FOR j := 0 to UpperBound
n := j*size
BoolRes[i][j] := (a[m+size-1:m] == b[n+size-1:n])
// invalidate characters after EOS
IF a[m+size-1:m] == 0
aInvalid := 1
FI
IF b[n+size-1:n] == 0
bInvalid := 1
FI
// override comparisons for invalid characters
CASE (imm8[3:2]) OF
0: // equal any
IF (!aInvalid && bInvalid)
BoolRes[i][j] := 0
ELSE IF (aInvalid && !bInvalid)
BoolRes[i][j] := 0
ELSE If (aInvalid && bInvalid)
BoolRes[i][j] := 0
FI
1: // ranges
IF (!aInvalid && bInvalid)
BoolRes[i][j] := 0
ELSE IF (aInvalid && !bInvalid)
BoolRes[i][j] := 0
ELSE If (aInvalid && bInvalid)
BoolRes[i][j] := 0
FI
2: // equal each
IF (!aInvalid && bInvalid)
BoolRes[i][j] := 0
ELSE IF (aInvalid && !bInvalid)
BoolRes[i][j] := 0
ELSE If (aInvalid && bInvalid)
BoolRes[i][j] := 1
FI
3: // equal ordered
IF (!aInvalid && bInvalid)
BoolRes[i][j] := 0
ELSE IF (aInvalid && !bInvalid)
BoolRes[i][j] := 1
ELSE If (aInvalid && bInvalid)
BoolRes[i][j] := 1
FI
ESAC
ENDFOR
ENDFOR
// aggregate results
CASE (imm8[3:2]) OF
0: // equal any
IntRes1 := 0
FOR i := 0 to UpperBound
FOR j := 0 to UpperBound
IntRes1[i] := IntRes1[i] OR BoolRes[i][j]
ENDFOR
ENDFOR
1: // ranges
IntRes1 := 0
FOR i := 0 to UpperBound
FOR j := 0 to UpperBound, j += 2
IntRes1[i] := IntRes1[i] OR (BoolRes[i][j] AND BoolRes[i][j+1])
ENDFOR
ENDFOR
2: // equal each
IntRes1 := 0
FOR i := 0 to UpperBound
IntRes1[i] := BoolRes[i][i]
ENDFOR
3: // equal ordered
IntRes1 := (imm8[0] ? 0xFF : 0xFFFF)
FOR i := 0 to UpperBound
k := i
FOR j := 0 to UpperBound-i
IntRes1[i] := IntRes1[i] AND BoolRes[k][j]
k++
ENDFOR
ENDFOR
ESAC
// optionally negate results
bInvalid := 0
FOR i := 0 to UpperBound
IF imm8[4]
IF imm8[5] // only negate valid
IF b[i*size+size-1:i*size] == 0
bInvalid := 1
FI
IF bInvalid // invalid, don't negate
IntRes2[i] := IntRes1[i]
ELSE // valid, negate
IntRes2[i] := -1 XOR IntRes1[i]
FI
ELSE // negate all
IntRes2[i] := -1 XOR IntRes1[i]
FI
ELSE // don't negate
IntRes2[i] := IntRes1[i]
FI
ENDFOR
// output
IF imm8[6] // most significant bit
tmp := UpperBound
dst := tmp
DO WHILE ((tmp >= 0) AND IntRes2[tmp] == 0)
tmp := tmp - 1
dst := tmp
OD
ELSE // least significant bit
tmp := 0
dst := tmp
DO WHILE ((tmp <= UpperBound) AND IntRes2[tmp] == 0)
tmp := tmp + 1
dst := tmp
OD
FI
Performance
pcmpistrm
__m128i _mm_cmpistrm (__m128i a, __m128i b, const int imm8)
Synopsis
__m128i _mm_cmpistrm (__m128i a, __m128i b, const int imm8)
#include "nmmintrin.h"
Instruction: pcmpistrm xmm, xmm, imm
CPUID Flags: SSE4.2
Description
Compare packed strings with implicit lengths in
a and
b using the control in
imm8, and store the generated mask in
dst.
imm can be a combination of:
_SIDD_UBYTE_OPS // unsigned 8-bit characters
_SIDD_UWORD_OPS // unsigned 16-bit characters
_SIDD_SBYTE_OPS // signed 8-bit characters
_SIDD_SWORD_OPS // signed 16-bit characters
_SIDD_CMP_EQUAL_ANY // compare equal any
_SIDD_CMP_RANGES // compare ranges
_SIDD_CMP_EQUAL_EACH // compare equal each
_SIDD_CMP_EQUAL_ORDERED // compare equal ordered
_SIDD_NEGATIVE_POLARITY // negate results
_SIDD_MASKED_NEGATIVE_POLARITY // negate results only before end of string
_SIDD_LEAST_SIGNIFICANT // index only: return least significant bit
_SIDD_MOST_SIGNIFICANT // index only: return most significant bit
_SIDD_BIT_MASK // mask only: return bit mask
_SIDD_UNIT_MASK // mask only: return byte/word mask
Operation
size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
UpperBound := (128 / size) - 1
// compare all characters
aInvalid := 0
bInvalid := 0
FOR i := 0 to UpperBound
m := i*size
FOR j := 0 to UpperBound
n := j*size
BoolRes[i][j] := (a[m+size-1:m] == b[n+size-1:n])
// invalidate characters after EOS
IF a[m+size-1:m] == 0
aInvalid := 1
FI
IF b[n+size-1:n] == 0
bInvalid := 1
FI
// override comparisons for invalid characters
CASE (imm8[3:2]) OF
0: // equal any
IF (!aInvalid && bInvalid)
BoolRes[i][j] := 0
ELSE IF (aInvalid && !bInvalid)
BoolRes[i][j] := 0
ELSE If (aInvalid && bInvalid)
BoolRes[i][j] := 0
FI
1: // ranges
IF (!aInvalid && bInvalid)
BoolRes[i][j] := 0
ELSE IF (aInvalid && !bInvalid)
BoolRes[i][j] := 0
ELSE If (aInvalid && bInvalid)
BoolRes[i][j] := 0
FI
2: // equal each
IF (!aInvalid && bInvalid)
BoolRes[i][j] := 0
ELSE IF (aInvalid && !bInvalid)
BoolRes[i][j] := 0
ELSE If (aInvalid && bInvalid)
BoolRes[i][j] := 1
FI
3: // equal ordered
IF (!aInvalid && bInvalid)
BoolRes[i][j] := 0
ELSE IF (aInvalid && !bInvalid)
BoolRes[i][j] := 1
ELSE If (aInvalid && bInvalid)
BoolRes[i][j] := 1
FI
ESAC
ENDFOR
ENDFOR
// aggregate results
CASE (imm8[3:2]) OF
0: // equal any
IntRes1 := 0
FOR i := 0 to UpperBound
FOR j := 0 to UpperBound
IntRes1[i] := IntRes1[i] OR BoolRes[i][j]
ENDFOR
ENDFOR
1: // ranges
IntRes1 := 0
FOR i := 0 to UpperBound
FOR j := 0 to UpperBound, j += 2
IntRes1[i] := IntRes1[i] OR (BoolRes[i][j] AND BoolRes[i][j+1])
ENDFOR
ENDFOR
2: // equal each
IntRes1 := 0
FOR i := 0 to UpperBound
IntRes1[i] := BoolRes[i][i]
ENDFOR
3: // equal ordered
IntRes1 := (imm8[0] ? 0xFF : 0xFFFF)
FOR i := 0 to UpperBound
k := i
FOR j := 0 to UpperBound-i
IntRes1[i] := IntRes1[i] AND BoolRes[k][j]
k++
ENDFOR
ENDFOR
ESAC
// optionally negate results
bInvalid := 0
FOR i := 0 to UpperBound
IF imm8[4]
IF imm8[5] // only negate valid
IF b[i*size+size-1:i*size] == 0
bInvalid := 1
FI
IF bInvalid // invalid, don't negate
IntRes2[i] := IntRes1[i]
ELSE // valid, negate
IntRes2[i] := -1 XOR IntRes1[i]
FI
ELSE // negate all
IntRes2[i] := -1 XOR IntRes1[i]
FI
ELSE // don't negate
IntRes2[i] := IntRes1[i]
FI
ENDFOR
// output
IF imm8[6] // byte / word mask
FOR i := 0 to UpperBound
j := i*size
IF IntRes2[i]
dst[j+size-1:j] := (imm8[0] ? 0xFF : 0xFFFF)
ELSE
dst[j+size-1:j] := 0
FI
ENDFOR
ELSE // bit mask
dst[UpperBound:0] := IntRes2[UpperBound:0]
dst[127:UpperBound+1] := 0
FI
Performance
pcmpistri
int _mm_cmpistro (__m128i a, __m128i b, const int imm8)
Synopsis
int _mm_cmpistro (__m128i a, __m128i b, const int imm8)
#include "nmmintrin.h"
Instruction: pcmpistri xmm, xmm, imm
CPUID Flags: SSE4.2
Description
Compare packed strings with implicit lengths in
a and
b using the control in
imm8, and returns bit 0 of the resulting bit mask.
imm can be a combination of:
_SIDD_UBYTE_OPS // unsigned 8-bit characters
_SIDD_UWORD_OPS // unsigned 16-bit characters
_SIDD_SBYTE_OPS // signed 8-bit characters
_SIDD_SWORD_OPS // signed 16-bit characters
_SIDD_CMP_EQUAL_ANY // compare equal any
_SIDD_CMP_RANGES // compare ranges
_SIDD_CMP_EQUAL_EACH // compare equal each
_SIDD_CMP_EQUAL_ORDERED // compare equal ordered
_SIDD_NEGATIVE_POLARITY // negate results
_SIDD_MASKED_NEGATIVE_POLARITY // negate results only before end of string
_SIDD_LEAST_SIGNIFICANT // index only: return last significant bit
_SIDD_MOST_SIGNIFICANT // index only: return most significant bit
_SIDD_BIT_MASK // mask only: return bit mask
_SIDD_UNIT_MASK // mask only: return byte/word mask
Operation
size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
UpperBound := (128 / size) - 1
// compare all characters
aInvalid := 0
bInvalid := 0
FOR i := 0 to UpperBound
m := i*size
FOR j := 0 to UpperBound
n := j*size
BoolRes[i][j] := (a[m+size-1:m] == b[n+size-1:n])
// invalidate characters after EOS
IF a[m+size-1:m] == 0
aInvalid := 1
FI
IF b[n+size-1:n] == 0
bInvalid := 1
FI
// override comparisons for invalid characters
CASE (imm8[3:2]) OF
0: // equal any
IF (!aInvalid && bInvalid)
BoolRes[i][j] := 0
ELSE IF (aInvalid && !bInvalid)
BoolRes[i][j] := 0
ELSE IF (aInvalid && bInvalid)
BoolRes[i][j] := 0
FI
1: // ranges
IF (!aInvalid && bInvalid)
BoolRes[i][j] := 0
ELSE IF (aInvalid && !bInvalid)
BoolRes[i][j] := 0
ELSE IF (aInvalid && bInvalid)
BoolRes[i][j] := 0
FI
2: // equal each
IF (!aInvalid && bInvalid)
BoolRes[i][j] := 0
ELSE IF (aInvalid && !bInvalid)
BoolRes[i][j] := 0
ELSE IF (aInvalid && bInvalid)
BoolRes[i][j] := 1
FI
3: // equal ordered
IF (!aInvalid && bInvalid)
BoolRes[i][j] := 0
ELSE IF (aInvalid && !bInvalid)
BoolRes[i][j] := 1
ELSE IF (aInvalid && bInvalid)
BoolRes[i][j] := 1
FI
ESAC
ENDFOR
ENDFOR
// aggregate results
CASE (imm8[3:2]) OF
0: // equal any
IntRes1 := 0
FOR i := 0 to UpperBound
FOR j := 0 to UpperBound
IntRes1[i] := IntRes1[i] OR BoolRes[i][j]
ENDFOR
ENDFOR
1: // ranges
IntRes1 := 0
FOR i := 0 to UpperBound
FOR j := 0 to UpperBound, j += 2
IntRes1[i] := IntRes1[i] OR (BoolRes[i][j] AND BoolRes[i][j+1])
ENDFOR
ENDFOR
2: // equal each
IntRes1 := 0
FOR i := 0 to UpperBound
IntRes1[i] := BoolRes[i][i]
ENDFOR
3: // equal ordered
IntRes1 := (imm8[0] ? 0xFF : 0xFFFF)
FOR i := 0 to UpperBound
k := i
FOR j := 0 to UpperBound-i
IntRes1[i] := IntRes1[i] AND BoolRes[k][j]
k++
ENDFOR
ENDFOR
ESAC
// optionally negate results
bInvalid := 0
FOR i := 0 to UpperBound
IF imm8[4]
IF imm8[5] // only negate valid
n := i*size
IF b[n+size-1:n] == 0
bInvalid := 1
FI
IF bInvalid // invalid, don't negate
IntRes2[i] := IntRes1[i]
ELSE // valid, negate
IntRes2[i] := -1 XOR IntRes1[i]
FI
ELSE // negate all
IntRes2[i] := -1 XOR IntRes1[i]
FI
ELSE // don't negate
IntRes2[i] := IntRes1[i]
FI
ENDFOR
// output
dst := IntRes2[0]
Performance
pcmpistri
int _mm_cmpistrs (__m128i a, __m128i b, const int imm8)
Synopsis
int _mm_cmpistrs (__m128i a, __m128i b, const int imm8)
#include "nmmintrin.h"
Instruction: pcmpistri xmm, xmm, imm
CPUID Flags: SSE4.2
Description
Compare packed strings with implicit lengths in
a and
b using the control in
imm8, and returns 1 if any character in
a was null, and 0 otherwise.
imm can be a combination of:
_SIDD_UBYTE_OPS // unsigned 8-bit characters
_SIDD_UWORD_OPS // unsigned 16-bit characters
_SIDD_SBYTE_OPS // signed 8-bit characters
_SIDD_SWORD_OPS // signed 16-bit characters
_SIDD_CMP_EQUAL_ANY // compare equal any
_SIDD_CMP_RANGES // compare ranges
_SIDD_CMP_EQUAL_EACH // compare equal each
_SIDD_CMP_EQUAL_ORDERED // compare equal ordered
_SIDD_NEGATIVE_POLARITY // negate results
_SIDD_MASKED_NEGATIVE_POLARITY // negate results only before end of string
_SIDD_LEAST_SIGNIFICANT // index only: return least significant bit
_SIDD_MOST_SIGNIFICANT // index only: return most significant bit
_SIDD_BIT_MASK // mask only: return bit mask
_SIDD_UNIT_MASK // mask only: return byte/word mask
Operation
size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
UpperBound := (128 / size) - 1
aInvalid := 0
FOR i := 0 to UpperBound
m := i*size
IF a[m+size-1:m] == 0
aInvalid := 1
FI
ENDFOR
dst := aInvalid
Performance
pcmpistri
int _mm_cmpistrz (__m128i a, __m128i b, const int imm8)
Synopsis
int _mm_cmpistrz (__m128i a, __m128i b, const int imm8)
#include "nmmintrin.h"
Instruction: pcmpistri xmm, xmm, imm
CPUID Flags: SSE4.2
Description
Compare packed strings with implicit lengths in
a and
b using the control in
imm8, and returns 1 if any character in
b was null, and 0 otherwise.
imm can be a combination of:
_SIDD_UBYTE_OPS // unsigned 8-bit characters
_SIDD_UWORD_OPS // unsigned 16-bit characters
_SIDD_SBYTE_OPS // signed 8-bit characters
_SIDD_SWORD_OPS // signed 16-bit characters
_SIDD_CMP_EQUAL_ANY // compare equal any
_SIDD_CMP_RANGES // compare ranges
_SIDD_CMP_EQUAL_EACH // compare equal each
_SIDD_CMP_EQUAL_ORDERED // compare equal ordered
_SIDD_NEGATIVE_POLARITY // negate results
_SIDD_MASKED_NEGATIVE_POLARITY // negate results only before end of string
_SIDD_LEAST_SIGNIFICANT // index only: return least significant bit
_SIDD_MOST_SIGNIFICANT // index only: return most significant bit
_SIDD_BIT_MASK // mask only: return bit mask
_SIDD_UNIT_MASK // mask only: return byte/word mask
Operation
size := (imm8[0] ? 16 : 8) // 8 or 16-bit characters
UpperBound := (128 / size) - 1
bInvalid := 0
FOR j := 0 to UpperBound
n := j*size
IF b[n+size-1:n] == 0
bInvalid := 1
FI
ENDFOR
dst := bInvalid
Performance
vpcmpw
__mmask8 _mm_cmple_epi16_mask (__m128i a, __m128i b)
Synopsis
__mmask8 _mm_cmple_epi16_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 7
i := j*16
k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vpcmpw
__mmask8 _mm_mask_cmple_epi16_mask (__mmask8 k1, __m128i a, __m128i b)
Synopsis
__mmask8 _mm_mask_cmple_epi16_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k1[j]
k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vpcmpw
__mmask16 _mm256_cmple_epi16_mask (__m256i a, __m256i b)
Synopsis
__mmask16 _mm256_cmple_epi16_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 15
i := j*16
k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vpcmpw
__mmask16 _mm256_mask_cmple_epi16_mask (__mmask16 k1, __m256i a, __m256i b)
Synopsis
__mmask16 _mm256_mask_cmple_epi16_mask (__mmask16 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k1[j]
k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vpcmpw
__mmask32 _mm512_cmple_epi16_mask (__m512i a, __m512i b)
Synopsis
__mmask32 _mm512_cmple_epi16_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512BW
Description
Compare packed 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 31
i := j*16
k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0
vpcmpw
__mmask32 _mm512_mask_cmple_epi16_mask (__mmask32 k1, __m512i a, __m512i b)
Synopsis
__mmask32 _mm512_mask_cmple_epi16_mask (__mmask32 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512BW
Description
Compare packed 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k1[j]
k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:32] := 0
vpcmpd
__mmask8 _mm_cmple_epi32_mask (__m128i a, __m128i b)
Synopsis
__mmask8 _mm_cmple_epi32_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 3
i := j*32
k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0
vpcmpd
__mmask8 _mm_mask_cmple_epi32_mask (__mmask8 k1, __m128i a, __m128i b)
Synopsis
__mmask8 _mm_mask_cmple_epi32_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k1[j]
k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:4] := 0
vpcmpd
__mmask8 _mm256_cmple_epi32_mask (__m256i a, __m256i b)
Synopsis
__mmask8 _mm256_cmple_epi32_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 7
i := j*32
k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vpcmpd
__mmask8 _mm256_mask_cmple_epi32_mask (__mmask8 k1, __m256i a, __m256i b)
Synopsis
__mmask8 _mm256_mask_cmple_epi32_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k1[j]
k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vpcmpd
__mmask16 _mm512_cmple_epi32_mask (__m512i a, __m512i b)
Synopsis
__mmask16 _mm512_cmple_epi32_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 15
i := j*32
k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vpcmpd
__mmask16 _mm512_mask_cmple_epi32_mask (__mmask16 k1, __m512i a, __m512i b)
Synopsis
__mmask16 _mm512_mask_cmple_epi32_mask (__mmask16 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k1[j]
k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vpcmpq
__mmask8 _mm_cmple_epi64_mask (__m128i a, __m128i b)
Synopsis
__mmask8 _mm_cmple_epi64_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 1
i := j*64
k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0
vpcmpq
__mmask8 _mm_mask_cmple_epi64_mask (__mmask8 k1, __m128i a, __m128i b)
Synopsis
__mmask8 _mm_mask_cmple_epi64_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k1[j]
k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:2] := 0
vpcmpq
__mmask8 _mm256_cmple_epi64_mask (__m256i a, __m256i b)
Synopsis
__mmask8 _mm256_cmple_epi64_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 3
i := j*64
k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0
vpcmpq
__mmask8 _mm256_mask_cmple_epi64_mask (__mmask8 k1, __m256i a, __m256i b)
Synopsis
__mmask8 _mm256_mask_cmple_epi64_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k1[j]
k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:4] := 0
vpcmpq
__mmask8 _mm512_cmple_epi64_mask (__m512i a, __m512i b)
Synopsis
__mmask8 _mm512_cmple_epi64_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpq k {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Compare packed 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 7
i := j*64
k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vpcmpq
__mmask8 _mm512_mask_cmple_epi64_mask (__mmask8 k1, __m512i a, __m512i b)
Synopsis
__mmask8 _mm512_mask_cmple_epi64_mask (__mmask8 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpq k {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Compare packed 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k1[j]
k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vpcmpb
__mmask16 _mm_cmple_epi8_mask (__m128i a, __m128i b)
Synopsis
__mmask16 _mm_cmple_epi8_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 15
i := j*8
k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vpcmpb
__mmask16 _mm_mask_cmple_epi8_mask (__mmask16 k1, __m128i a, __m128i b)
Synopsis
__mmask16 _mm_mask_cmple_epi8_mask (__mmask16 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k1[j]
k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vpcmpb
__mmask32 _mm256_cmple_epi8_mask (__m256i a, __m256i b)
Synopsis
__mmask32 _mm256_cmple_epi8_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 31
i := j*8
k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0
vpcmpb
__mmask32 _mm256_mask_cmple_epi8_mask (__mmask32 k1, __m256i a, __m256i b)
Synopsis
__mmask32 _mm256_mask_cmple_epi8_mask (__mmask32 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k1[j]
k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:32] := 0
vpcmpb
__mmask64 _mm512_cmple_epi8_mask (__m512i a, __m512i b)
Synopsis
__mmask64 _mm512_cmple_epi8_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512BW
Description
Compare packed 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 63
i := j*8
k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:64] := 0
vpcmpb
__mmask64 _mm512_mask_cmple_epi8_mask (__mmask64 k1, __m512i a, __m512i b)
Synopsis
__mmask64 _mm512_mask_cmple_epi8_mask (__mmask64 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512BW
Description
Compare packed 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k1[j]
k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:64] := 0
vpcmpuw
__mmask8 _mm_cmple_epu16_mask (__m128i a, __m128i b)
Synopsis
__mmask8 _mm_cmple_epu16_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 7
i := j*16
k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vpcmpuw
__mmask8 _mm_mask_cmple_epu16_mask (__mmask8 k1, __m128i a, __m128i b)
Synopsis
__mmask8 _mm_mask_cmple_epu16_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k1[j]
k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vpcmpuw
__mmask16 _mm256_cmple_epu16_mask (__m256i a, __m256i b)
Synopsis
__mmask16 _mm256_cmple_epu16_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 15
i := j*16
k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vpcmpuw
__mmask16 _mm256_mask_cmple_epu16_mask (__mmask16 k1, __m256i a, __m256i b)
Synopsis
__mmask16 _mm256_mask_cmple_epu16_mask (__mmask16 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k1[j]
k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vpcmpuw
__mmask32 _mm512_cmple_epu16_mask (__m512i a, __m512i b)
Synopsis
__mmask32 _mm512_cmple_epu16_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 31
i := j*16
k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0
vpcmpuw
__mmask32 _mm512_mask_cmple_epu16_mask (__mmask32 k1, __m512i a, __m512i b)
Synopsis
__mmask32 _mm512_mask_cmple_epu16_mask (__mmask32 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b for less-than-or-equal, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k1[j]
k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:32] := 0
vpcmpud
__mmask8 _mm_cmple_epu32_mask (__m128i a, __m128i b)
Synopsis
__mmask8 _mm_cmple_epu32_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 3
i := j*32
k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0
vpcmpud
__mmask8 _mm_mask_cmple_epu32_mask (__mmask8 k1, __m128i a, __m128i b)
Synopsis
__mmask8 _mm_mask_cmple_epu32_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k1[j]
k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:4] := 0
vpcmpud
__mmask8 _mm256_cmple_epu32_mask (__m256i a, __m256i b)
Synopsis
__mmask8 _mm256_cmple_epu32_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 7
i := j*32
k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vpcmpud
__mmask8 _mm256_mask_cmple_epu32_mask (__mmask8 k1, __m256i a, __m256i b)
Synopsis
__mmask8 _mm256_mask_cmple_epu32_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k1[j]
k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vpcmpud
__mmask16 _mm512_cmple_epu32_mask (__m512i a, __m512i b)
Synopsis
__mmask16 _mm512_cmple_epu32_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpud k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 15
i := j*32
k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vpcmpud
__mmask16 _mm512_mask_cmple_epu32_mask (__mmask16 k1, __m512i a, __m512i b)
Synopsis
__mmask16 _mm512_mask_cmple_epu32_mask (__mmask16 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpud k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k1[j]
k[j] := ( a[i+31:i] <= b[i+31:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vpcmpuq
__mmask8 _mm_cmple_epu64_mask (__m128i a, __m128i b)
Synopsis
__mmask8 _mm_cmple_epu64_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 1
i := j*64
k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0
vpcmpuq
__mmask8 _mm_mask_cmple_epu64_mask (__mmask8 k1, __m128i a, __m128i b)
Synopsis
__mmask8 _mm_mask_cmple_epu64_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k1[j]
k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:2] := 0
vpcmpuq
__mmask8 _mm256_cmple_epu64_mask (__m256i a, __m256i b)
Synopsis
__mmask8 _mm256_cmple_epu64_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 3
i := j*64
k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0
vpcmpuq
__mmask8 _mm256_mask_cmple_epu64_mask (__mmask8 k1, __m256i a, __m256i b)
Synopsis
__mmask8 _mm256_mask_cmple_epu64_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k1[j]
k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:4] := 0
vpcmpuq
__mmask8 _mm512_cmple_epu64_mask (__m512i a, __m512i b)
Synopsis
__mmask8 _mm512_cmple_epu64_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpuq k {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 7
i := j*64
k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vpcmpuq
__mmask8 _mm512_mask_cmple_epu64_mask (__mmask8 k1, __m512i a, __m512i b)
Synopsis
__mmask8 _mm512_mask_cmple_epu64_mask (__mmask8 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpuq k {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Compare packed unsigned 64-bit integers in a and b for less-than-or-equal, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k1[j]
k[j] := ( a[i+63:i] <= b[i+63:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vpcmpub
__mmask16 _mm_cmple_epu8_mask (__m128i a, __m128i b)
Synopsis
__mmask16 _mm_cmple_epu8_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 15
i := j*8
k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vpcmpub
__mmask16 _mm_mask_cmple_epu8_mask (__mmask16 k1, __m128i a, __m128i b)
Synopsis
__mmask16 _mm_mask_cmple_epu8_mask (__mmask16 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k1[j]
k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vpcmpub
__mmask32 _mm256_cmple_epu8_mask (__m256i a, __m256i b)
Synopsis
__mmask32 _mm256_cmple_epu8_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 31
i := j*8
k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0
vpcmpub
__mmask32 _mm256_mask_cmple_epu8_mask (__mmask32 k1, __m256i a, __m256i b)
Synopsis
__mmask32 _mm256_mask_cmple_epu8_mask (__mmask32 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k1[j]
k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:32] := 0
vpcmpub
__mmask64 _mm512_cmple_epu8_mask (__m512i a, __m512i b)
Synopsis
__mmask64 _mm512_cmple_epu8_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 63
i := j*8
k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:64] := 0
vpcmpub
__mmask64 _mm512_mask_cmple_epu8_mask (__mmask64 k1, __m512i a, __m512i b)
Synopsis
__mmask64 _mm512_mask_cmple_epu8_mask (__mmask64 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k1[j]
k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:64] := 0
cmppd
__m128d _mm_cmple_pd (__m128d a, __m128d b)
Synopsis
__m128d _mm_cmple_pd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: cmppd xmm, xmm, imm
CPUID Flags: SSE2
Description
Compare packed double-precision (64-bit) floating-point elements in a and b for less-than-or-equal, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := (a[i+63:i] <= b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0
ENDFOR
Performance
vcmppd
__mmask8 _mm512_cmple_pd_mask (__m512d a, __m512d b)
Synopsis
__mmask8 _mm512_cmple_pd_mask (__m512d a, __m512d b)
#include "immintrin.h"
Instruction: vcmppd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed double-precision (64-bit) floating-point elements in a and b for less-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 7
i := j*64
k[j] := (a[i+63:i] <= b[i+63:i]) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vcmppd
__mmask8 _mm512_mask_cmple_pd_mask (__mmask8 k1, __m512d a, __m512d b)
Synopsis
__mmask8 _mm512_mask_cmple_pd_mask (__mmask8 k1, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vcmppd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed double-precision (64-bit) floating-point elements in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k1[j]
k[j] := (a[i+63:i] <= b[i+63:i]) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
cmpps
__m128 _mm_cmple_ps (__m128 a, __m128 b)
Synopsis
__m128 _mm_cmple_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: cmpps xmm, xmm, imm
CPUID Flags: SSE
Description
Compare packed single-precision (32-bit) floating-point elements in a and b for less-than-or-equal, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := ( a[i+31:i] <= b[i+31:i] ) ? 0xffffffff : 0
ENDFOR
Performance
vcmpps
__mmask16 _mm512_cmple_ps_mask (__m512 a, __m512 b)
Synopsis
__mmask16 _mm512_cmple_ps_mask (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vcmpps k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed single-precision (32-bit) floating-point elements in a and b for less-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 15
i := j*32
k[j] := (a[i+31:i] <= b[i+31:i]) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vcmpps
__mmask16 _mm512_mask_cmple_ps_mask (__mmask16 k1, __m512 a, __m512 b)
Synopsis
__mmask16 _mm512_mask_cmple_ps_mask (__mmask16 k1, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vcmpps k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed single-precision (32-bit) floating-point elements in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k1[j]
k[j] := (a[i+31:i] <= b[i+31:i]) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
cmpsd
__m128d _mm_cmple_sd (__m128d a, __m128d b)
Synopsis
__m128d _mm_cmple_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: cmpsd xmm, xmm, xmm, imm
CPUID Flags: SSE2
Description
Compare the lower double-precision (64-bit) floating-point elements in a and b for less-than-or-equal, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
Operation
dst[63:0] := (a[63:0] <= b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0
dst[127:64] := a[127:64]
Performance
cmpss
__m128 _mm_cmple_ss (__m128 a, __m128 b)
Synopsis
__m128 _mm_cmple_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: cmpss xmm, xmm, imm
CPUID Flags: SSE
Description
Compare the lower single-precision (32-bit) floating-point elements in a and b for less-than-or-equal, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
dst[31:0] := ( a[31:0] <= b[31:0] ) ? 0xffffffff : 0
dst[127:32] := a[127:32]
Performance
pcmpgtw
__m128i _mm_cmplt_epi16 (__m128i a, __m128i b)
Synopsis
__m128i _mm_cmplt_epi16 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: pcmpgtw xmm, xmm
CPUID Flags: SSE2
Description
Compare packed 16-bit integers in a and b for less-than, and store the results in dst. Note: This intrinsic emits the pcmpgtw instruction with the order of the operands switched.
Operation
FOR j := 0 to 7
i := j*16
dst[i+15:i] := ( a[i+15:i] < b[i+15:i] ) ? 0xFFFF : 0
ENDFOR
Performance
vpcmpw
__mmask8 _mm_cmplt_epi16_mask (__m128i a, __m128i b)
Synopsis
__mmask8 _mm_cmplt_epi16_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 16-bit integers in a and b for less-than, and store the results in mask vector k.
Operation
FOR j := 0 to 7
i := j*16
k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vpcmpw
__mmask8 _mm_mask_cmplt_epi16_mask (__mmask8 k1, __m128i a, __m128i b)
Synopsis
__mmask8 _mm_mask_cmplt_epi16_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 16-bit integers in a and b for less-than, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k1[j]
k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vpcmpw
__mmask16 _mm256_cmplt_epi16_mask (__m256i a, __m256i b)
Synopsis
__mmask16 _mm256_cmplt_epi16_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 16-bit integers in a and b for less-than, and store the results in mask vector k.
Operation
FOR j := 0 to 15
i := j*16
k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vpcmpw
__mmask16 _mm256_mask_cmplt_epi16_mask (__mmask16 k1, __m256i a, __m256i b)
Synopsis
__mmask16 _mm256_mask_cmplt_epi16_mask (__mmask16 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 16-bit integers in a and b for less-than, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k1[j]
k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vpcmpw
__mmask32 _mm512_cmplt_epi16_mask (__m512i a, __m512i b)
Synopsis
__mmask32 _mm512_cmplt_epi16_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512BW
Description
Compare packed 16-bit integers in a and b for less-than, and store the results in mask vector k.
Operation
FOR j := 0 to 31
i := j*16
k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0
vpcmpw
__mmask32 _mm512_mask_cmplt_epi16_mask (__mmask32 k1, __m512i a, __m512i b)
Synopsis
__mmask32 _mm512_mask_cmplt_epi16_mask (__mmask32 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512BW
Description
Compare packed 16-bit integers in a and b for less-than, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k1[j]
k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:32] := 0
pcmpgtd
__m128i _mm_cmplt_epi32 (__m128i a, __m128i b)
Synopsis
__m128i _mm_cmplt_epi32 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: pcmpgtd xmm, xmm
CPUID Flags: SSE2
Description
Compare packed 32-bit integers in a and b for less-than, and store the results in dst. Note: This intrinsic emits the pcmpgtd instruction with the order of the operands switched.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := ( a[i+31:i] < b[i+31:i] ) ? 0xFFFFFFFF : 0
ENDFOR
Performance
vpcmpd
__mmask8 _mm_cmplt_epi32_mask (__m128i a, __m128i b)
Synopsis
__mmask8 _mm_cmplt_epi32_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 32-bit integers in a and b for less-than, and store the results in mask vector k.
Operation
FOR j := 0 to 3
i := j*32
k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0
vpcmpd
__mmask8 _mm_mask_cmplt_epi32_mask (__mmask8 k1, __m128i a, __m128i b)
Synopsis
__mmask8 _mm_mask_cmplt_epi32_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 32-bit integers in a and b for less-than, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k1[j]
k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:4] := 0
vpcmpd
__mmask8 _mm256_cmplt_epi32_mask (__m256i a, __m256i b)
Synopsis
__mmask8 _mm256_cmplt_epi32_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 32-bit integers in a and b for less-than, and store the results in mask vector k.
Operation
FOR j := 0 to 7
i := j*32
k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vpcmpd
__mmask8 _mm256_mask_cmplt_epi32_mask (__mmask8 k1, __m256i a, __m256i b)
Synopsis
__mmask8 _mm256_mask_cmplt_epi32_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 32-bit integers in a and b for less-than, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k1[j]
k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vpcmpltd
__mmask16 _mm512_cmplt_epi32_mask (__m512i a, __m512i b)
Synopsis
__mmask16 _mm512_cmplt_epi32_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpltd k {k}, zmm, zmm
CPUID Flags: KNCNI
Description
Compare packed 32-bit integers in a and b for less-than, and store the results in mask vector k.
Operation
FOR j := 0 to 15
i := j*32
k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vpcmpd
__mmask16 _mm512_cmplt_epi32_mask (__m512i a, __m512i b)
Synopsis
__mmask16 _mm512_cmplt_epi32_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Compare packed 32-bit integers in a and b for less-than, and store the results in mask vector k.
Operation
FOR j := 0 to 15
i := j*32
k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vpcmpd
__mmask16 _mm512_mask_cmplt_epi32_mask (__mmask16 k1, __m512i a, __m512i b)
Synopsis
__mmask16 _mm512_mask_cmplt_epi32_mask (__mmask16 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Compare packed 32-bit integers in a and b for less-than, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k1[j]
k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vpcmpltd
__mmask16 _mm512_mask_cmplt_epi32_mask (__mmask16 k1, __m512i a, __m512i b)
Synopsis
__mmask16 _mm512_mask_cmplt_epi32_mask (__mmask16 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpltd k {k}, zmm, zmm
CPUID Flags: KNCNI
Description
Compare packed 32-bit integers in a and b for less-than, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k1[j]
k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vpcmpq
__mmask8 _mm_cmplt_epi64_mask (__m128i a, __m128i b)
Synopsis
__mmask8 _mm_cmplt_epi64_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 64-bit integers in a and b for less-than, and store the results in mask vector k.
Operation
FOR j := 0 to 1
i := j*64
k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0
vpcmpq
__mmask8 _mm_mask_cmplt_epi64_mask (__mmask8 k1, __m128i a, __m128i b)
Synopsis
__mmask8 _mm_mask_cmplt_epi64_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 64-bit integers in a and b for less-than, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k1[j]
k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:2] := 0
vpcmpq
__mmask8 _mm256_cmplt_epi64_mask (__m256i a, __m256i b)
Synopsis
__mmask8 _mm256_cmplt_epi64_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 64-bit integers in a and b for less-than, and store the results in mask vector k.
Operation
FOR j := 0 to 3
i := j*64
k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0
vpcmpq
__mmask8 _mm256_mask_cmplt_epi64_mask (__mmask8 k1, __m256i a, __m256i b)
Synopsis
__mmask8 _mm256_mask_cmplt_epi64_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 64-bit integers in a and b for less-than, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k1[j]
k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:4] := 0
vpcmpq
__mmask8 _mm512_cmplt_epi64_mask (__m512i a, __m512i b)
Synopsis
__mmask8 _mm512_cmplt_epi64_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpq k {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Compare packed 64-bit integers in a and b for less-than, and store the results in mask vector k.
Operation
FOR j := 0 to 7
i := j*64
k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vpcmpq
__mmask8 _mm512_mask_cmplt_epi64_mask (__mmask8 k1, __m512i a, __m512i b)
Synopsis
__mmask8 _mm512_mask_cmplt_epi64_mask (__mmask8 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpq k {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Compare packed 64-bit integers in a and b for less-than, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k1[j]
k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
pcmpgtb
__m128i _mm_cmplt_epi8 (__m128i a, __m128i b)
Synopsis
__m128i _mm_cmplt_epi8 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: pcmpgtb xmm, xmm
CPUID Flags: SSE2
Description
Compare packed 8-bit integers in a and b for less-than, and store the results in dst. Note: This intrinsic emits the pcmpgtb instruction with the order of the operands switched.
Operation
FOR j := 0 to 15
i := j*8
dst[i+7:i] := ( a[i+7:i] < b[i+7:i] ) ? 0xFF : 0
ENDFOR
Performance
vpcmpb
__mmask16 _mm_cmplt_epi8_mask (__m128i a, __m128i b)
Synopsis
__mmask16 _mm_cmplt_epi8_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 8-bit integers in a and b for less-than, and store the results in mask vector k.
Operation
FOR j := 0 to 15
i := j*8
k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vpcmpb
__mmask16 _mm_mask_cmplt_epi8_mask (__mmask16 k1, __m128i a, __m128i b)
Synopsis
__mmask16 _mm_mask_cmplt_epi8_mask (__mmask16 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 8-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k1[j]
k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vpcmpb
__mmask32 _mm256_cmplt_epi8_mask (__m256i a, __m256i b)
Synopsis
__mmask32 _mm256_cmplt_epi8_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 8-bit integers in a and b for less-than, and store the results in mask vector k.
Operation
FOR j := 0 to 31
i := j*8
k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0
vpcmpb
__mmask32 _mm256_mask_cmplt_epi8_mask (__mmask32 k1, __m256i a, __m256i b)
Synopsis
__mmask32 _mm256_mask_cmplt_epi8_mask (__mmask32 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 8-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k1[j]
k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:32] := 0
vpcmpb
__mmask64 _mm512_cmplt_epi8_mask (__m512i a, __m512i b)
Synopsis
__mmask64 _mm512_cmplt_epi8_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512BW
Description
Compare packed 8-bit integers in a and b for less-than, and store the results in mask vector k.
Operation
FOR j := 0 to 63
i := j*8
k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:64] := 0
vpcmpb
__mmask64 _mm512_mask_cmplt_epi8_mask (__mmask64 k1, __m512i a, __m512i b)
Synopsis
__mmask64 _mm512_mask_cmplt_epi8_mask (__mmask64 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512BW
Description
Compare packed 8-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k1[j]
k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:64] := 0
vpcmpuw
__mmask8 _mm_cmplt_epu16_mask (__m128i a, __m128i b)
Synopsis
__mmask8 _mm_cmplt_epu16_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b for less-than, and store the results in mask vector k.
Operation
FOR j := 0 to 7
i := j*16
k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vpcmpuw
__mmask8 _mm_mask_cmplt_epu16_mask (__mmask8 k1, __m128i a, __m128i b)
Synopsis
__mmask8 _mm_mask_cmplt_epu16_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b for less-than, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k1[j]
k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vpcmpuw
__mmask16 _mm256_cmplt_epu16_mask (__m256i a, __m256i b)
Synopsis
__mmask16 _mm256_cmplt_epu16_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b for less-than, and store the results in mask vector k.
Operation
FOR j := 0 to 15
i := j*16
k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vpcmpuw
__mmask16 _mm256_mask_cmplt_epu16_mask (__mmask16 k1, __m256i a, __m256i b)
Synopsis
__mmask16 _mm256_mask_cmplt_epu16_mask (__mmask16 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b for less-than, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k1[j]
k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vpcmpuw
__mmask32 _mm512_cmplt_epu16_mask (__m512i a, __m512i b)
Synopsis
__mmask32 _mm512_cmplt_epu16_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b for less-than, and store the results in mask vector k.
Operation
FOR j := 0 to 31
i := j*16
k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0
vpcmpuw
__mmask32 _mm512_mask_cmplt_epu16_mask (__mmask32 k1, __m512i a, __m512i b)
Synopsis
__mmask32 _mm512_mask_cmplt_epu16_mask (__mmask32 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b for less-than, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k1[j]
k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:32] := 0
vpcmpud
__mmask8 _mm_cmplt_epu32_mask (__m128i a, __m128i b)
Synopsis
__mmask8 _mm_cmplt_epu32_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k.
Operation
FOR j := 0 to 3
i := j*32
k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0
vpcmpud
__mmask8 _mm_mask_cmplt_epu32_mask (__mmask8 k1, __m128i a, __m128i b)
Synopsis
__mmask8 _mm_mask_cmplt_epu32_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k1[j]
k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:4] := 0
vpcmpud
__mmask8 _mm256_cmplt_epu32_mask (__m256i a, __m256i b)
Synopsis
__mmask8 _mm256_cmplt_epu32_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k.
Operation
FOR j := 0 to 7
i := j*32
k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vpcmpud
__mmask8 _mm256_mask_cmplt_epu32_mask (__mmask8 k1, __m256i a, __m256i b)
Synopsis
__mmask8 _mm256_mask_cmplt_epu32_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k1[j]
k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vpcmpud
__mmask16 _mm512_cmplt_epu32_mask (__m512i a, __m512i b)
Synopsis
__mmask16 _mm512_cmplt_epu32_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpud k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k.
Operation
FOR j := 0 to 15
i := j*32
k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vpcmpud
__mmask16 _mm512_mask_cmplt_epu32_mask (__mmask16 k1, __m512i a, __m512i b)
Synopsis
__mmask16 _mm512_mask_cmplt_epu32_mask (__mmask16 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpud k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k1[j]
k[j] := ( a[i+31:i] < b[i+31:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vpcmpuq
__mmask8 _mm_cmplt_epu64_mask (__m128i a, __m128i b)
Synopsis
__mmask8 _mm_cmplt_epu64_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in mask vector k.
Operation
FOR j := 0 to 1
i := j*64
k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0
vpcmpuq
__mmask8 _mm_mask_cmplt_epu64_mask (__mmask8 k1, __m128i a, __m128i b)
Synopsis
__mmask8 _mm_mask_cmplt_epu64_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k1[j]
k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:2] := 0
vpcmpuq
__mmask8 _mm256_cmplt_epu64_mask (__m256i a, __m256i b)
Synopsis
__mmask8 _mm256_cmplt_epu64_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in mask vector k.
Operation
FOR j := 0 to 3
i := j*64
k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0
vpcmpuq
__mmask8 _mm256_mask_cmplt_epu64_mask (__mmask8 k1, __m256i a, __m256i b)
Synopsis
__mmask8 _mm256_mask_cmplt_epu64_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k1[j]
k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:4] := 0
vpcmpuq
__mmask8 _mm512_cmplt_epu64_mask (__m512i a, __m512i b)
Synopsis
__mmask8 _mm512_cmplt_epu64_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpuq k {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in mask vector k.
Operation
FOR j := 0 to 7
i := j*64
k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vpcmpuq
__mmask8 _mm512_mask_cmplt_epu64_mask (__mmask8 k1, __m512i a, __m512i b)
Synopsis
__mmask8 _mm512_mask_cmplt_epu64_mask (__mmask8 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpuq k {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Compare packed unsigned 64-bit integers in a and b for less-than, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k1[j]
k[j] := ( a[i+63:i] < b[i+63:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vpcmpub
__mmask16 _mm_cmplt_epu8_mask (__m128i a, __m128i b)
Synopsis
__mmask16 _mm_cmplt_epu8_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b for less-than, and store the results in mask vector k.
Operation
FOR j := 0 to 15
i := j*8
k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vpcmpub
__mmask16 _mm_mask_cmplt_epu8_mask (__mmask16 k1, __m128i a, __m128i b)
Synopsis
__mmask16 _mm_mask_cmplt_epu8_mask (__mmask16 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k1[j]
k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vpcmpub
__mmask32 _mm256_cmplt_epu8_mask (__m256i a, __m256i b)
Synopsis
__mmask32 _mm256_cmplt_epu8_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b for less-than, and store the results in mask vector k.
Operation
FOR j := 0 to 31
i := j*8
k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0
vpcmpub
__mmask32 _mm256_mask_cmplt_epu8_mask (__mmask32 k1, __m256i a, __m256i b)
Synopsis
__mmask32 _mm256_mask_cmplt_epu8_mask (__mmask32 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k1[j]
k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:32] := 0
vpcmpub
__mmask64 _mm512_cmplt_epu8_mask (__m512i a, __m512i b)
Synopsis
__mmask64 _mm512_cmplt_epu8_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b for less-than, and store the results in mask vector k.
Operation
FOR j := 0 to 63
i := j*8
k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:64] := 0
vpcmpub
__mmask64 _mm512_mask_cmplt_epu8_mask (__mmask64 k1, __m512i a, __m512i b)
Synopsis
__mmask64 _mm512_mask_cmplt_epu8_mask (__mmask64 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k1[j]
k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:64] := 0
cmppd
__m128d _mm_cmplt_pd (__m128d a, __m128d b)
Synopsis
__m128d _mm_cmplt_pd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: cmppd xmm, xmm, imm
CPUID Flags: SSE2
Description
Compare packed double-precision (64-bit) floating-point elements in a and b for less-than, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := (a[i+63:i] < b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0
ENDFOR
Performance
vcmppd
__mmask8 _mm512_cmplt_pd_mask (__m512d a, __m512d b)
Synopsis
__mmask8 _mm512_cmplt_pd_mask (__m512d a, __m512d b)
#include "immintrin.h"
Instruction: vcmppd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed double-precision (64-bit) floating-point elements in a and b for less-than, and store the results in mask vector k.
Operation
FOR j := 0 to 7
i := j*64
k[j] := (a[i+63:i] < b[i+63:i]) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vcmppd
__mmask8 _mm512_mask_cmplt_pd_mask (__mmask8 k1, __m512d a, __m512d b)
Synopsis
__mmask8 _mm512_mask_cmplt_pd_mask (__mmask8 k1, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vcmppd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed double-precision (64-bit) floating-point elements in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k1[j]
k[j] := (a[i+63:i] < b[i+63:i]) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
cmpps
__m128 _mm_cmplt_ps (__m128 a, __m128 b)
Synopsis
__m128 _mm_cmplt_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: cmpps xmm, xmm, imm
CPUID Flags: SSE
Description
Compare packed single-precision (32-bit) floating-point elements in a and b for less-than, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := ( a[i+31:i] < b[i+31:i] ) ? 0xffffffff : 0
ENDFOR
Performance
vcmpps
__mmask16 _mm512_cmplt_ps_mask (__m512 a, __m512 b)
Synopsis
__mmask16 _mm512_cmplt_ps_mask (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vcmpps k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed single-precision (32-bit) floating-point elements in a and b for less-than, and store the results in mask vector k.
Operation
FOR j := 0 to 15
i := j*32
k[j] := (a[i+31:i] < b[i+31:i]) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vcmpps
__mmask16 _mm512_mask_cmplt_ps_mask (__mmask16 k1, __m512 a, __m512 b)
Synopsis
__mmask16 _mm512_mask_cmplt_ps_mask (__mmask16 k1, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vcmpps k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed single-precision (32-bit) floating-point elements in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k1[j]
k[j] := (a[i+31:i] < b[i+31:i]) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
cmpsd
__m128d _mm_cmplt_sd (__m128d a, __m128d b)
Synopsis
__m128d _mm_cmplt_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: cmpsd xmm, xmm, xmm, imm
CPUID Flags: SSE2
Description
Compare the lower double-precision (64-bit) floating-point elements in a and b for less-than, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
Operation
dst[63:0] := (a[63:0] < b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0
dst[127:64] := a[127:64]
Performance
cmpss
__m128 _mm_cmplt_ss (__m128 a, __m128 b)
Synopsis
__m128 _mm_cmplt_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: cmpss xmm, xmm, imm
CPUID Flags: SSE
Description
Compare the lower single-precision (32-bit) floating-point elements in a and b for less-than, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
dst[31:0] := ( a[31:0] < b[31:0] ) ? 0xffffffff : 0
dst[127:32] := a[127:32]
Performance
vpcmpw
__mmask8 _mm_cmpneq_epi16_mask (__m128i a, __m128i b)
Synopsis
__mmask8 _mm_cmpneq_epi16_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 16-bit integers in a and b for not-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 7
i := j*16
k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vpcmpw
__mmask8 _mm_mask_cmpneq_epi16_mask (__mmask8 k1, __m128i a, __m128i b)
Synopsis
__mmask8 _mm_mask_cmpneq_epi16_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 16-bit integers in a and b for not-equal, and store the results in mask vector k1 using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k1[j]
k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vpcmpw
__mmask16 _mm256_cmpneq_epi16_mask (__m256i a, __m256i b)
Synopsis
__mmask16 _mm256_cmpneq_epi16_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 16-bit integers in a and b for not-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 15
i := j*16
k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vpcmpw
__mmask16 _mm256_mask_cmpneq_epi16_mask (__mmask16 k1, __m256i a, __m256i b)
Synopsis
__mmask16 _mm256_mask_cmpneq_epi16_mask (__mmask16 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 16-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k1[j]
k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vpcmpw
__mmask32 _mm512_cmpneq_epi16_mask (__m512i a, __m512i b)
Synopsis
__mmask32 _mm512_cmpneq_epi16_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512BW
Description
Compare packed 16-bit integers in a and b for not-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 31
i := j*16
k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0
vpcmpw
__mmask32 _mm512_mask_cmpneq_epi16_mask (__mmask32 k1, __m512i a, __m512i b)
Synopsis
__mmask32 _mm512_mask_cmpneq_epi16_mask (__mmask32 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpw
CPUID Flags: AVX512BW
Description
Compare packed 16-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k1[j]
k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:32] := 0
vpcmpd
__mmask8 _mm_cmpneq_epi32_mask (__m128i a, __m128i b)
Synopsis
__mmask8 _mm_cmpneq_epi32_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 32-bit integers in a and b for not-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 3
i := j*32
k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0
vpcmpd
__mmask8 _mm_mask_cmpneq_epi32_mask (__mmask8 k1, __m128i a, __m128i b)
Synopsis
__mmask8 _mm_mask_cmpneq_epi32_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 32-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k1[j]
k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:4] := 0
vpcmpd
__mmask8 _mm256_cmpneq_epi32_mask (__m256i a, __m256i b)
Synopsis
__mmask8 _mm256_cmpneq_epi32_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 32-bit integers in a and b for not-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 7
i := j*32
k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vpcmpd
__mmask8 _mm256_mask_cmpneq_epi32_mask (__mmask8 k1, __m256i a, __m256i b)
Synopsis
__mmask8 _mm256_mask_cmpneq_epi32_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpd
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 32-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k1[j]
k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vpcmpd
__mmask16 _mm512_cmpneq_epi32_mask (__m512i a, __m512i b)
Synopsis
__mmask16 _mm512_cmpneq_epi32_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed 32-bit integers in a and b for not-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 15
i := j*32
k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vpcmpd
__mmask16 _mm512_mask_cmpneq_epi32_mask (__mmask16 k1, __m512i a, __m512i b)
Synopsis
__mmask16 _mm512_mask_cmpneq_epi32_mask (__mmask16 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed 32-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k1[j]
k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vpcmpq
__mmask8 _mm_cmpneq_epi64_mask (__m128i a, __m128i b)
Synopsis
__mmask8 _mm_cmpneq_epi64_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 64-bit integers in a and b for not-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 1
i := j*64
k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0
vpcmpq
__mmask8 _mm_mask_cmpneq_epi64_mask (__mmask8 k1, __m128i a, __m128i b)
Synopsis
__mmask8 _mm_mask_cmpneq_epi64_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 64-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k1[j]
k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:2] := 0
vpcmpq
__mmask8 _mm256_cmpneq_epi64_mask (__m256i a, __m256i b)
Synopsis
__mmask8 _mm256_cmpneq_epi64_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 64-bit integers in a and b for not-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 3
i := j*64
k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0
vpcmpq
__mmask8 _mm256_mask_cmpneq_epi64_mask (__mmask8 k1, __m256i a, __m256i b)
Synopsis
__mmask8 _mm256_mask_cmpneq_epi64_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 64-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k1[j]
k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:4] := 0
vpcmpq
__mmask8 _mm512_cmpneq_epi64_mask (__m512i a, __m512i b)
Synopsis
__mmask8 _mm512_cmpneq_epi64_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpq k {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Compare packed 64-bit integers in a and b for not-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 7
i := j*64
k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vpcmpq
__mmask8 _mm512_mask_cmpneq_epi64_mask (__mmask8 k1, __m512i a, __m512i b)
Synopsis
__mmask8 _mm512_mask_cmpneq_epi64_mask (__mmask8 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpq k {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Compare packed 64-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k1[j]
k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vpcmpb
__mmask16 _mm_cmpneq_epi8_mask (__m128i a, __m128i b)
Synopsis
__mmask16 _mm_cmpneq_epi8_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 8-bit integers in a and b for not-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 15
i := j*8
k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vpcmpb
__mmask16 _mm_mask_cmpneq_epi8_mask (__mmask16 k1, __m128i a, __m128i b)
Synopsis
__mmask16 _mm_mask_cmpneq_epi8_mask (__mmask16 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 8-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k1[j]
k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vpcmpb
__mmask32 _mm256_cmpneq_epi8_mask (__m256i a, __m256i b)
Synopsis
__mmask32 _mm256_cmpneq_epi8_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 8-bit integers in a and b for not-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 31
i := j*8
k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0
vpcmpb
__mmask32 _mm256_mask_cmpneq_epi8_mask (__mmask32 k1, __m256i a, __m256i b)
Synopsis
__mmask32 _mm256_mask_cmpneq_epi8_mask (__mmask32 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 8-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k1[j]
k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:32] := 0
vpcmpb
__mmask64 _mm512_cmpneq_epi8_mask (__m512i a, __m512i b)
Synopsis
__mmask64 _mm512_cmpneq_epi8_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512BW
Description
Compare packed 8-bit integers in a and b for not-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 63
i := j*8
k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:64] := 0
vpcmpb
__mmask64 _mm512_mask_cmpneq_epi8_mask (__mmask64 k1, __m512i a, __m512i b)
Synopsis
__mmask64 _mm512_mask_cmpneq_epi8_mask (__mmask64 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpb
CPUID Flags: AVX512BW
Description
Compare packed 8-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k1[j]
k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:64] := 0
vpcmpuw
__mmask8 _mm_cmpneq_epu16_mask (__m128i a, __m128i b)
Synopsis
__mmask8 _mm_cmpneq_epu16_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b for not-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 7
i := j*16
k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vpcmpuw
__mmask8 _mm_mask_cmpneq_epu16_mask (__mmask8 k1, __m128i a, __m128i b)
Synopsis
__mmask8 _mm_mask_cmpneq_epu16_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k1[j]
k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vpcmpuw
__mmask16 _mm256_cmpneq_epu16_mask (__m256i a, __m256i b)
Synopsis
__mmask16 _mm256_cmpneq_epu16_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b for not-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 15
i := j*16
k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vpcmpuw
__mmask16 _mm256_mask_cmpneq_epu16_mask (__mmask16 k1, __m256i a, __m256i b)
Synopsis
__mmask16 _mm256_mask_cmpneq_epu16_mask (__mmask16 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k1[j]
k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vpcmpuw
__mmask32 _mm512_cmpneq_epu16_mask (__m512i a, __m512i b)
Synopsis
__mmask32 _mm512_cmpneq_epu16_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b for not-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 31
i := j*16
k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0
vpcmpuw
__mmask32 _mm512_mask_cmpneq_epu16_mask (__mmask32 k1, __m512i a, __m512i b)
Synopsis
__mmask32 _mm512_mask_cmpneq_epu16_mask (__mmask32 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpuw
CPUID Flags: AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k1[j]
k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:32] := 0
vpcmpud
__mmask8 _mm_cmpneq_epu32_mask (__m128i a, __m128i b)
Synopsis
__mmask8 _mm_cmpneq_epu32_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 32-bit integers in a and b for not-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 3
i := j*32
k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0
vpcmpud
__mmask8 _mm_mask_cmpneq_epu32_mask (__mmask8 k1, __m128i a, __m128i b)
Synopsis
__mmask8 _mm_mask_cmpneq_epu32_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 32-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k1[j]
k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:4] := 0
vpcmpud
__mmask8 _mm256_cmpneq_epu32_mask (__m256i a, __m256i b)
Synopsis
__mmask8 _mm256_cmpneq_epu32_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 32-bit integers in a and b for not-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 7
i := j*32
k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vpcmpud
__mmask8 _mm256_mask_cmpneq_epu32_mask (__mmask8 k1, __m256i a, __m256i b)
Synopsis
__mmask8 _mm256_mask_cmpneq_epu32_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpud
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 32-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k1[j]
k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vpcmpud
__mmask16 _mm512_cmpneq_epu32_mask (__m512i a, __m512i b)
Synopsis
__mmask16 _mm512_cmpneq_epu32_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpud k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed unsigned 32-bit integers in a and b for not-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 15
i := j*32
k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vpcmpud
__mmask16 _mm512_mask_cmpneq_epu32_mask (__mmask16 k1, __m512i a, __m512i b)
Synopsis
__mmask16 _mm512_mask_cmpneq_epu32_mask (__mmask16 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpud k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed unsigned 32-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k1[j]
k[j] := ( a[i+31:i] != b[i+31:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vpcmpuq
__mmask8 _mm_cmpneq_epu64_mask (__m128i a, __m128i b)
Synopsis
__mmask8 _mm_cmpneq_epu64_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 64-bit integers in a and b for not-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 1
i := j*64
k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:2] := 0
vpcmpuq
__mmask8 _mm_mask_cmpneq_epu64_mask (__mmask8 k1, __m128i a, __m128i b)
Synopsis
__mmask8 _mm_mask_cmpneq_epu64_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 64-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k1[j]
k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:2] := 0
vpcmpuq
__mmask8 _mm256_cmpneq_epu64_mask (__m256i a, __m256i b)
Synopsis
__mmask8 _mm256_cmpneq_epu64_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 64-bit integers in a and b for not-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 3
i := j*64
k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:4] := 0
vpcmpuq
__mmask8 _mm256_mask_cmpneq_epu64_mask (__mmask8 k1, __m256i a, __m256i b)
Synopsis
__mmask8 _mm256_mask_cmpneq_epu64_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpuq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 64-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k1[j]
k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:4] := 0
vpcmpuq
__mmask8 _mm512_cmpneq_epu64_mask (__m512i a, __m512i b)
Synopsis
__mmask8 _mm512_cmpneq_epu64_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpuq k {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Compare packed unsigned 64-bit integers in a and b for not-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 7
i := j*64
k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vpcmpuq
__mmask8 _mm512_mask_cmpneq_epu64_mask (__mmask8 k1, __m512i a, __m512i b)
Synopsis
__mmask8 _mm512_mask_cmpneq_epu64_mask (__mmask8 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpuq k {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Compare packed unsigned 64-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k1[j]
k[j] := ( a[i+63:i] != b[i+63:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vpcmpub
__mmask16 _mm_cmpneq_epu8_mask (__m128i a, __m128i b)
Synopsis
__mmask16 _mm_cmpneq_epu8_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b for not-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 15
i := j*8
k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vpcmpub
__mmask16 _mm_mask_cmpneq_epu8_mask (__mmask16 k1, __m128i a, __m128i b)
Synopsis
__mmask16 _mm_mask_cmpneq_epu8_mask (__mmask16 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k1[j]
k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vpcmpub
__mmask32 _mm256_cmpneq_epu8_mask (__m256i a, __m256i b)
Synopsis
__mmask32 _mm256_cmpneq_epu8_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b for not-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 31
i := j*8
k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0
vpcmpub
__mmask32 _mm256_mask_cmpneq_epu8_mask (__mmask32 k1, __m256i a, __m256i b)
Synopsis
__mmask32 _mm256_mask_cmpneq_epu8_mask (__mmask32 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k1[j]
k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:32] := 0
vpcmpub
__mmask64 _mm512_cmpneq_epu8_mask (__m512i a, __m512i b)
Synopsis
__mmask64 _mm512_cmpneq_epu8_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b for not-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 63
i := j*8
k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:64] := 0
vpcmpub
__mmask64 _mm512_mask_cmpneq_epu8_mask (__mmask64 k1, __m512i a, __m512i b)
Synopsis
__mmask64 _mm512_mask_cmpneq_epu8_mask (__mmask64 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpcmpub
CPUID Flags: AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k1[j]
k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:64] := 0
cmppd
__m128d _mm_cmpneq_pd (__m128d a, __m128d b)
Synopsis
__m128d _mm_cmpneq_pd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: cmppd xmm, xmm, imm
CPUID Flags: SSE2
Description
Compare packed double-precision (64-bit) floating-point elements in a and b for not-equal, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := (a[i+63:i] != b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0
ENDFOR
Performance
vcmppd
__mmask8 _mm512_cmpneq_pd_mask (__m512d a, __m512d b)
Synopsis
__mmask8 _mm512_cmpneq_pd_mask (__m512d a, __m512d b)
#include "immintrin.h"
Instruction: vcmppd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed double-precision (64-bit) floating-point elements in a and b for not-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 7
i := j*64
k[j] := (a[i+63:i] != b[i+63:i]) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vcmppd
__mmask8 _mm512_mask_cmpneq_pd_mask (__mmask8 k1, __m512d a, __m512d b)
Synopsis
__mmask8 _mm512_mask_cmpneq_pd_mask (__mmask8 k1, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vcmppd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed double-precision (64-bit) floating-point elements in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k1[j]
k[j] := (a[i+63:i] != b[i+63:i]) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
cmpps
__m128 _mm_cmpneq_ps (__m128 a, __m128 b)
Synopsis
__m128 _mm_cmpneq_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: cmpps xmm, xmm, imm
CPUID Flags: SSE
Description
Compare packed single-precision (32-bit) floating-point elements in a and b for not-equal, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := ( a[i+31:i] != b[i+31:i] ) ? 0xffffffff : 0
ENDFOR
Performance
vcmpps
__mmask16 _mm512_cmpneq_ps_mask (__m512 a, __m512 b)
Synopsis
__mmask16 _mm512_cmpneq_ps_mask (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vcmpps k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed single-precision (32-bit) floating-point elements in a and b for not-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 15
i := j*32
k[j] := (a[i+31:i] != b[i+31:i]) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vcmpps
__mmask16 _mm512_mask_cmpneq_ps_mask (__mmask16 k1, __m512 a, __m512 b)
Synopsis
__mmask16 _mm512_mask_cmpneq_ps_mask (__mmask16 k1, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vcmpps k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed single-precision (32-bit) floating-point elements in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k1[j]
k[j] := (a[i+31:i] != b[i+31:i]) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
cmpsd
__m128d _mm_cmpneq_sd (__m128d a, __m128d b)
Synopsis
__m128d _mm_cmpneq_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: cmpsd xmm, xmm, xmm, imm
CPUID Flags: SSE2
Description
Compare the lower double-precision (64-bit) floating-point elements in a and b for not-equal, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
Operation
dst[63:0] := (a[63:0] != b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0
dst[127:64] := a[127:64]
Performance
cmpss
__m128 _mm_cmpneq_ss (__m128 a, __m128 b)
Synopsis
__m128 _mm_cmpneq_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: cmpss xmm, xmm, imm
CPUID Flags: SSE
Description
Compare the lower single-precision (32-bit) floating-point elements in a and b for not-equal, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
dst[31:0] := ( a[31:0] != b[31:0] ) ? 0xffffffff : 0
dst[127:32] := a[127:32]
Performance
cmppd
__m128d _mm_cmpnge_pd (__m128d a, __m128d b)
Synopsis
__m128d _mm_cmpnge_pd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: cmppd xmm, xmm, imm
CPUID Flags: SSE2
Description
Compare packed double-precision (64-bit) floating-point elements in a and b for not-greater-than-or-equal, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := !(a[i+63:i] >= b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0
ENDFOR
Performance
cmpps
__m128 _mm_cmpnge_ps (__m128 a, __m128 b)
Synopsis
__m128 _mm_cmpnge_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: cmpps xmm, xmm, imm
CPUID Flags: SSE
Description
Compare packed single-precision (32-bit) floating-point elements in a and b for not-greater-than-or-equal, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := !( a[i+31:i] >= b[i+31:i] ) ? 0xffffffff : 0
ENDFOR
Performance
cmpsd
__m128d _mm_cmpnge_sd (__m128d a, __m128d b)
Synopsis
__m128d _mm_cmpnge_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: cmpsd xmm, xmm, xmm, imm
CPUID Flags: SSE2
Description
Compare the lower double-precision (64-bit) floating-point elements in a and b for not-greater-than-or-equal, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
Operation
dst[63:0] := !(a[63:0] >= b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0
dst[127:64] := a[127:64]
Performance
cmpss
__m128 _mm_cmpnge_ss (__m128 a, __m128 b)
Synopsis
__m128 _mm_cmpnge_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: cmpss xmm, xmm, imm
CPUID Flags: SSE
Description
Compare the lower single-precision (32-bit) floating-point elements in a and b for not-greater-than-or-equal, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
dst[31:0] := !( a[31:0] >= b[31:0] ) ? 0xffffffff : 0
dst[127:32] := a[127:32]
Performance
cmppd
__m128d _mm_cmpngt_pd (__m128d a, __m128d b)
Synopsis
__m128d _mm_cmpngt_pd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: cmppd xmm, xmm, imm
CPUID Flags: SSE2
Description
Compare packed double-precision (64-bit) floating-point elements in a and b for not-greater-than, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := !(a[i+63:i] > b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0
ENDFOR
Performance
cmpps
__m128 _mm_cmpngt_ps (__m128 a, __m128 b)
Synopsis
__m128 _mm_cmpngt_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: cmpps xmm, xmm, imm
CPUID Flags: SSE
Description
Compare packed single-precision (32-bit) floating-point elements in a and b for not-greater-than, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := !( a[i+31:i] > b[i+31:i] ) ? 0xffffffff : 0
ENDFOR
Performance
cmpsd
__m128d _mm_cmpngt_sd (__m128d a, __m128d b)
Synopsis
__m128d _mm_cmpngt_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: cmpsd xmm, xmm, xmm, imm
CPUID Flags: SSE2
Description
Compare the lower double-precision (64-bit) floating-point elements in a and b for not-greater-than, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
Operation
dst[63:0] := !(a[63:0] > b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0
dst[127:64] := a[127:64]
Performance
cmpss
__m128 _mm_cmpngt_ss (__m128 a, __m128 b)
Synopsis
__m128 _mm_cmpngt_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: cmpss xmm, xmm, imm
CPUID Flags: SSE
Description
Compare the lower single-precision (32-bit) floating-point elements in a and b for not-greater-than, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
dst[31:0] := !( a[31:0] > b[31:0] ) ? 0xffffffff : 0
dst[127:32] := a[127:32]
Performance
cmppd
__m128d _mm_cmpnle_pd (__m128d a, __m128d b)
Synopsis
__m128d _mm_cmpnle_pd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: cmppd xmm, xmm, imm
CPUID Flags: SSE2
Description
Compare packed double-precision (64-bit) floating-point elements in a and b for not-less-than-or-equal, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := !(a[i+63:i] <= b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0
ENDFOR
Performance
vcmppd
__mmask8 _mm512_cmpnle_pd_mask (__m512d a, __m512d b)
Synopsis
__mmask8 _mm512_cmpnle_pd_mask (__m512d a, __m512d b)
#include "immintrin.h"
Instruction: vcmppd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed double-precision (64-bit) floating-point elements in a and b for not-less-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 7
i := j*64
k[j] := !(a[i+63:i] <= b[i+63:i]) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vcmppd
__mmask8 _mm512_mask_cmpnle_pd_mask (__mmask8 k1, __m512d a, __m512d b)
Synopsis
__mmask8 _mm512_mask_cmpnle_pd_mask (__mmask8 k1, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vcmppd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed double-precision (64-bit) floating-point elements in a and b for not-less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k1[j]
k[j] := !(a[i+63:i] <= b[i+63:i]) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
cmpps
__m128 _mm_cmpnle_ps (__m128 a, __m128 b)
Synopsis
__m128 _mm_cmpnle_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: cmpps xmm, xmm, imm
CPUID Flags: SSE
Description
Compare packed single-precision (32-bit) floating-point elements in a and b for not-less-than-or-equal, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := !( a[i+31:i] <= b[i+31:i] ) ? 0xffffffff : 0
ENDFOR
Performance
vcmpps
__mmask16 _mm512_cmpnle_ps_mask (__m512 a, __m512 b)
Synopsis
__mmask16 _mm512_cmpnle_ps_mask (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vcmpps k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed single-precision (32-bit) floating-point elements in a and b for not-less-than-or-equal, and store the results in mask vector k.
Operation
FOR j := 0 to 15
i := j*32
k[j] := !(a[i+31:i] <= b[i+31:i]) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vcmpps
__mmask16 _mm512_mask_cmpnle_ps_mask (__mmask16 k1, __m512 a, __m512 b)
Synopsis
__mmask16 _mm512_mask_cmpnle_ps_mask (__mmask16 k1, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vcmpps k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed single-precision (32-bit) floating-point elements in a and b for not-less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k1[j]
k[j] := !(a[i+31:i] <= b[i+31:i]) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
cmpsd
__m128d _mm_cmpnle_sd (__m128d a, __m128d b)
Synopsis
__m128d _mm_cmpnle_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: cmpsd xmm, xmm, xmm, imm
CPUID Flags: SSE2
Description
Compare the lower double-precision (64-bit) floating-point elements in a and b for not-less-than-or-equal, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
Operation
dst[63:0] := !(a[63:0] <= b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0
dst[127:64] := a[127:64]
Performance
cmpss
__m128 _mm_cmpnle_ss (__m128 a, __m128 b)
Synopsis
__m128 _mm_cmpnle_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: cmpss xmm, xmm, imm
CPUID Flags: SSE
Description
Compare the lower single-precision (32-bit) floating-point elements in a and b for not-less-than-or-equal, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
dst[31:0] := !( a[31:0] <= b[31:0] ) ? 0xffffffff : 0
dst[127:32] := a[127:32]
Performance
cmppd
__m128d _mm_cmpnlt_pd (__m128d a, __m128d b)
Synopsis
__m128d _mm_cmpnlt_pd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: cmppd xmm, xmm, imm
CPUID Flags: SSE2
Description
Compare packed double-precision (64-bit) floating-point elements in a and b for not-less-than, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := !(a[i+63:i] < b[i+63:i]) ? 0xFFFFFFFFFFFFFFFF : 0
ENDFOR
Performance
vcmppd
__mmask8 _mm512_cmpnlt_pd_mask (__m512d a, __m512d b)
Synopsis
__mmask8 _mm512_cmpnlt_pd_mask (__m512d a, __m512d b)
#include "immintrin.h"
Instruction: vcmppd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed double-precision (64-bit) floating-point elements in a and b for not-less-than, and store the results in mask vector k.
Operation
FOR j := 0 to 7
i := j*64
k[j] := !(a[i+63:i] < b[i+63:i]) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vcmppd
__mmask8 _mm512_mask_cmpnlt_pd_mask (__mmask8 k1, __m512d a, __m512d b)
Synopsis
__mmask8 _mm512_mask_cmpnlt_pd_mask (__mmask8 k1, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vcmppd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed double-precision (64-bit) floating-point elements in a and b for not-less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k1[j]
k[j] := !(a[i+63:i] < b[i+63:i]) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
cmpps
__m128 _mm_cmpnlt_ps (__m128 a, __m128 b)
Synopsis
__m128 _mm_cmpnlt_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: cmpps xmm, xmm, imm
CPUID Flags: SSE
Description
Compare packed single-precision (32-bit) floating-point elements in a and b for not-less-than, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := !( a[i+31:i] < b[i+31:i] ) ? 0xffffffff : 0
ENDFOR
Performance
vcmpps
__mmask16 _mm512_cmpnlt_ps_mask (__m512 a, __m512 b)
Synopsis
__mmask16 _mm512_cmpnlt_ps_mask (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vcmpps k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed single-precision (32-bit) floating-point elements in a and b for not-less-than, and store the results in mask vector k.
Operation
FOR j := 0 to 15
i := j*32
k[j] := !(a[i+31:i] < b[i+31:i]) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vcmpps
__mmask16 _mm512_mask_cmpnlt_ps_mask (__mmask16 k1, __m512 a, __m512 b)
Synopsis
__mmask16 _mm512_mask_cmpnlt_ps_mask (__mmask16 k1, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vcmpps k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed single-precision (32-bit) floating-point elements in a and b for not-less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k1[j]
k[j] := !(a[i+31:i] < b[i+31:i]) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
cmpsd
__m128d _mm_cmpnlt_sd (__m128d a, __m128d b)
Synopsis
__m128d _mm_cmpnlt_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: cmpsd xmm, xmm, xmm, imm
CPUID Flags: SSE2
Description
Compare the lower double-precision (64-bit) floating-point elements in a and b for not-less-than, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
Operation
dst[63:0] := !(a[63:0] < b[63:0]) ? 0xFFFFFFFFFFFFFFFF : 0
dst[127:64] := a[127:64]
Performance
cmpss
__m128 _mm_cmpnlt_ss (__m128 a, __m128 b)
Synopsis
__m128 _mm_cmpnlt_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: cmpss xmm, xmm, imm
CPUID Flags: SSE
Description
Compare the lower single-precision (32-bit) floating-point elements in a and b for not-less-than, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
dst[31:0] := !( a[31:0] < b[31:0] ) ? 0xffffffff : 0
dst[127:32] := a[127:32]
Performance
cmppd
__m128d _mm_cmpord_pd (__m128d a, __m128d b)
Synopsis
__m128d _mm_cmpord_pd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: cmppd xmm, xmm, imm
CPUID Flags: SSE2
Description
Compare packed double-precision (64-bit) floating-point elements in a and b to see if neither is NaN, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := (a[i+63:i] != NaN AND b[i+63:i] != NaN) ? 0xFFFFFFFFFFFFFFFF : 0
ENDFOR
Performance
vcmppd
__mmask8 _mm512_cmpord_pd_mask (__m512d a, __m512d b)
Synopsis
__mmask8 _mm512_cmpord_pd_mask (__m512d a, __m512d b)
#include "immintrin.h"
Instruction: vcmppd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed double-precision (64-bit) floating-point elements in a and b to see if neither is NaN, and store the results in mask vector k.
Operation
FOR j := 0 to 7
i := j*64
k[j] := (a[i+63:i] != NaN AND b[i+63:i] != NaN) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vcmppd
__mmask8 _mm512_mask_cmpord_pd_mask (__mmask8 k1, __m512d a, __m512d b)
Synopsis
__mmask8 _mm512_mask_cmpord_pd_mask (__mmask8 k1, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vcmppd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed double-precision (64-bit) floating-point elements in a and b to see if neither is NaN, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k1[j]
k[j] := (a[i+63:i] != NaN AND b[i+63:i] != NaN) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
cmpps
__m128 _mm_cmpord_ps (__m128 a, __m128 b)
Synopsis
__m128 _mm_cmpord_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: cmpps xmm, xmm, imm
CPUID Flags: SSE
Description
Compare packed single-precision (32-bit) floating-point elements in a and b to see if neither is NaN, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := ( a[i+31:i] != NaN AND b[i+31:i] != NaN ) ? 0xffffffff : 0
ENDFOR
Performance
vcmpps
__mmask16 _mm512_cmpord_ps_mask (__m512 a, __m512 b)
Synopsis
__mmask16 _mm512_cmpord_ps_mask (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vcmpps k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed single-precision (32-bit) floating-point elements in a and b to see if neither is NaN, and store the results in mask vector k.
Operation
FOR j := 0 to 15
i := j*32
k[j] := (a[i+31:i] != NaN AND b[i+31:i] != NaN) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vcmpps
__mmask16 _mm512_mask_cmpord_ps_mask (__mmask16 k1, __m512 a, __m512 b)
Synopsis
__mmask16 _mm512_mask_cmpord_ps_mask (__mmask16 k1, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vcmpps k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed single-precision (32-bit) floating-point elements in a and b to see if neither is NaN, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k1[j]
k[j] := (a[i+31:i] != NaN AND b[i+31:i] != NaN) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
cmpsd
__m128d _mm_cmpord_sd (__m128d a, __m128d b)
Synopsis
__m128d _mm_cmpord_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: cmpsd xmm, xmm, xmm, imm
CPUID Flags: SSE2
Description
Compare the lower double-precision (64-bit) floating-point elements in a and b to see if neither is NaN, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
Operation
dst[63:0] := (a[63:0] != NaN AND b[63:0] != NaN) ? 0xFFFFFFFFFFFFFFFF : 0
dst[127:64] := a[127:64]
Performance
cmpss
__m128 _mm_cmpord_ss (__m128 a, __m128 b)
Synopsis
__m128 _mm_cmpord_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: cmpss xmm, xmm, imm
CPUID Flags: SSE
Description
Compare the lower single-precision (32-bit) floating-point elements in a and b to see if neither is NaN, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
dst[31:0] := ( a[31:0] != NaN AND b[31:0] != NaN ) ? 0xffffffff : 0
dst[127:32] := a[127:32]
Performance
cmppd
__m128d _mm_cmpunord_pd (__m128d a, __m128d b)
Synopsis
__m128d _mm_cmpunord_pd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: cmppd xmm, xmm, imm
CPUID Flags: SSE2
Description
Compare packed double-precision (64-bit) floating-point elements in a and b to see if either is NaN, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := (a[i+63:i] == NaN OR b[i+63:i] == NaN) ? 0xFFFFFFFFFFFFFFFF : 0
ENDFOR
Performance
vcmppd
__mmask8 _mm512_cmpunord_pd_mask (__m512d a, __m512d b)
Synopsis
__mmask8 _mm512_cmpunord_pd_mask (__m512d a, __m512d b)
#include "immintrin.h"
Instruction: vcmppd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed double-precision (64-bit) floating-point elements in a and b to see if either is NaN, and store the results in mask vector k.
Operation
FOR j := 0 to 7
i := j*64
k[j] := (a[i+63:i] == NaN OR b[i+63:i] == NaN) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vcmppd
__mmask8 _mm512_mask_cmpunord_pd_mask (__mmask8 k1, __m512d a, __m512d b)
Synopsis
__mmask8 _mm512_mask_cmpunord_pd_mask (__mmask8 k1, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vcmppd k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed double-precision (64-bit) floating-point elements in a and b to see if either is NaN, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k1[j]
k[j] := (a[i+63:i] == NaN OR b[i+63:i] == NaN) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
cmpps
__m128 _mm_cmpunord_ps (__m128 a, __m128 b)
Synopsis
__m128 _mm_cmpunord_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: cmpps xmm, xmm, imm
CPUID Flags: SSE
Description
Compare packed single-precision (32-bit) floating-point elements in a and b to see if either is NaN, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := ( a[i+31:i] == NaN OR b[i+31:i] == NaN ) ? 0xffffffff : 0
ENDFOR
Performance
vcmpps
__mmask16 _mm512_cmpunord_ps_mask (__m512 a, __m512 b)
Synopsis
__mmask16 _mm512_cmpunord_ps_mask (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vcmpps k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed single-precision (32-bit) floating-point elements in a and b to see if either is NaN, and store the results in mask vector k.
Operation
FOR j := 0 to 15
i := j*32
k[j] := (a[i+31:i] == NaN OR b[i+31:i] == NaN) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vcmpps
__mmask16 _mm512_mask_cmpunord_ps_mask (__mmask16 k1, __m512 a, __m512 b)
Synopsis
__mmask16 _mm512_mask_cmpunord_ps_mask (__mmask16 k1, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vcmpps k {k}, zmm, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed single-precision (32-bit) floating-point elements in a and b to see if either is NaN, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k1[j]
k[j] := (a[i+31:i] == NaN OR b[i+31:i] == NaN) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
cmpsd
__m128d _mm_cmpunord_sd (__m128d a, __m128d b)
Synopsis
__m128d _mm_cmpunord_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: cmpsd xmm, xmm, xmm, imm
CPUID Flags: SSE2
Description
Compare the lower double-precision (64-bit) floating-point elements in a and b to see if either is NaN, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
Operation
dst[63:0] := (a[63:0] == NaN OR b[63:0] == NaN) ? 0xFFFFFFFFFFFFFFFF : 0
dst[127:64] := a[127:64]
Performance
cmpss
__m128 _mm_cmpunord_ss (__m128 a, __m128 b)
Synopsis
__m128 _mm_cmpunord_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: cmpss xmm, xmm, imm
CPUID Flags: SSE
Description
Compare the lower single-precision (32-bit) floating-point elements in a and b to see if either is NaN, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
dst[31:0] := ( a[31:0] == NaN OR b[31:0] == NaN ) ? 0xffffffff : 0
dst[127:32] := a[127:32]
Performance
vcomisd
int _mm_comi_round_sd (__m128d a, __m128d b, const int imm8, const int sae)
Synopsis
int _mm_comi_round_sd (__m128d a, __m128d b, const int imm8, const int sae)
#include "immintrin.h"
Instruction: vcomisd xmm, xmm {sae}
CPUID Flags: AVX512F
Description
Compare the lower double-precision (64-bit) floating-point element in a and b based on the comparison operand specified by imm8, and return the boolean result (0 or 1).
Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
RETURN ( a[63:0] OP b[63:0] ) ? 1 : 0
vcomiss
int _mm_comi_round_ss (__m128 a, __m128 b, const int imm8, const int sae)
Synopsis
int _mm_comi_round_ss (__m128 a, __m128 b, const int imm8, const int sae)
#include "immintrin.h"
Instruction: vcomiss xmm, xmm {sae}
CPUID Flags: AVX512F
Description
Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and return the boolean result (0 or 1).
Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
CASE (imm8[7:0]) OF
0: OP := _CMP_EQ_OQ
1: OP := _CMP_LT_OS
2: OP := _CMP_LE_OS
3: OP := _CMP_UNORD_Q
4: OP := _CMP_NEQ_UQ
5: OP := _CMP_NLT_US
6: OP := _CMP_NLE_US
7: OP := _CMP_ORD_Q
8: OP := _CMP_EQ_UQ
9: OP := _CMP_NGE_US
10: OP := _CMP_NGT_US
11: OP := _CMP_FALSE_OQ
12: OP := _CMP_NEQ_OQ
13: OP := _CMP_GE_OS
14: OP := _CMP_GT_OS
15: OP := _CMP_TRUE_UQ
16: OP := _CMP_EQ_OS
17: OP := _CMP_LT_OQ
18: OP := _CMP_LE_OQ
19: OP := _CMP_UNORD_S
20: OP := _CMP_NEQ_US
21: OP := _CMP_NLT_UQ
22: OP := _CMP_NLE_UQ
23: OP := _CMP_ORD_S
24: OP := _CMP_EQ_US
25: OP := _CMP_NGE_UQ
26: OP := _CMP_NGT_UQ
27: OP := _CMP_FALSE_OS
28: OP := _CMP_NEQ_OS
29: OP := _CMP_GE_OQ
30: OP := _CMP_GT_OQ
31: OP := _CMP_TRUE_US
ESAC
RETURN ( a[31:0] OP b[31:0] ) ? 1 : 0
comisd
int _mm_comieq_sd (__m128d a, __m128d b)
Synopsis
int _mm_comieq_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: comisd xmm, xmm
CPUID Flags: SSE2
Description
Compare the lower double-precision (64-bit) floating-point element in a and b for equality, and return the boolean result (0 or 1).
Operation
RETURN ( a[63:0] == b[63:0] ) ? 1 : 0
Performance
comiss
int _mm_comieq_ss (__m128 a, __m128 b)
Synopsis
int _mm_comieq_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: comiss xmm, xmm
CPUID Flags: SSE
Description
Compare the lower single-precision (32-bit) floating-point element in a and b for equality, and return the boolean result (0 or 1).
Operation
RETURN ( a[31:0] == b[31:0] ) ? 1 : 0
Performance
comisd
int _mm_comige_sd (__m128d a, __m128d b)
Synopsis
int _mm_comige_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: comisd xmm, xmm
CPUID Flags: SSE2
Description
Compare the lower double-precision (64-bit) floating-point element in a and b for greater-than-or-equal, and return the boolean result (0 or 1).
Operation
RETURN ( a[63:0] >= b[63:0] ) ? 1 : 0
Performance
comiss
int _mm_comige_ss (__m128 a, __m128 b)
Synopsis
int _mm_comige_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: comiss xmm, xmm
CPUID Flags: SSE
Description
Compare the lower single-precision (32-bit) floating-point element in a and b for greater-than-or-equal, and return the boolean result (0 or 1).
Operation
RETURN ( a[31:0] >= b[31:0] ) ? 1 : 0
Performance
comisd
int _mm_comigt_sd (__m128d a, __m128d b)
Synopsis
int _mm_comigt_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: comisd xmm, xmm
CPUID Flags: SSE2
Description
Compare the lower double-precision (64-bit) floating-point element in a and b for greater-than, and return the boolean result (0 or 1).
Operation
RETURN ( a[63:0] > b[63:0] ) ? 1 : 0
Performance
comiss
int _mm_comigt_ss (__m128 a, __m128 b)
Synopsis
int _mm_comigt_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: comiss xmm, xmm
CPUID Flags: SSE
Description
Compare the lower single-precision (32-bit) floating-point element in a and b for greater-than, and return the boolean result (0 or 1).
Operation
RETURN ( a[31:0] > b[31:0] ) ? 1 : 0
Performance
comisd
int _mm_comile_sd (__m128d a, __m128d b)
Synopsis
int _mm_comile_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: comisd xmm, xmm
CPUID Flags: SSE2
Description
Compare the lower double-precision (64-bit) floating-point element in a and b for less-than-or-equal, and return the boolean result (0 or 1).
Operation
RETURN ( a[63:0] <= b[63:0] ) ? 1 : 0
Performance
comiss
int _mm_comile_ss (__m128 a, __m128 b)
Synopsis
int _mm_comile_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: comiss xmm, xmm
CPUID Flags: SSE
Description
Compare the lower single-precision (32-bit) floating-point element in a and b for less-than-or-equal, and return the boolean result (0 or 1).
Operation
RETURN ( a[31:0] <= b[31:0] ) ? 1 : 0
Performance
comisd
int _mm_comilt_sd (__m128d a, __m128d b)
Synopsis
int _mm_comilt_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: comisd xmm, xmm
CPUID Flags: SSE2
Description
Compare the lower double-precision (64-bit) floating-point element in a and b for less-than, and return the boolean result (0 or 1).
Operation
RETURN ( a[63:0] < b[63:0] ) ? 1 : 0
Performance
comiss
int _mm_comilt_ss (__m128 a, __m128 b)
Synopsis
int _mm_comilt_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: comiss xmm, xmm
CPUID Flags: SSE
Description
Compare the lower single-precision (32-bit) floating-point element in a and b for less-than, and return the boolean result (0 or 1).
Operation
RETURN ( a[31:0] < b[31:0] ) ? 1 : 0
Performance
comisd
int _mm_comineq_sd (__m128d a, __m128d b)
Synopsis
int _mm_comineq_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: comisd xmm, xmm
CPUID Flags: SSE2
Description
Compare the lower double-precision (64-bit) floating-point element in a and b for not-equal, and return the boolean result (0 or 1).
Operation
RETURN ( a[63:0] != b[63:0] ) ? 1 : 0
Performance
comiss
int _mm_comineq_ss (__m128 a, __m128 b)
Synopsis
int _mm_comineq_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: comiss xmm, xmm
CPUID Flags: SSE
Description
Compare the lower single-precision (32-bit) floating-point element in a and b for not-equal, and return the boolean result (0 or 1).
Operation
RETURN ( a[31:0] != b[31:0] ) ? 1 : 0
Performance
vpcompressd
__m128i _mm_mask_compress_epi32 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_compress_epi32 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpcompressd
CPUID Flags: AVX512VL + AVX512F
Description
Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
Operation
size := 32
m := 0
FOR j := 0 to 3
i := j*32
IF k[j]
dst[m+size-1:m] := a[i+31:i]
m := m + size
FI
ENDFOR
dst[127:m] := src[127:m]
dst[MAX:128] := 0
vpcompressd
__m128i _mm_maskz_compress_epi32 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_compress_epi32 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpcompressd
CPUID Flags: AVX512VL + AVX512F
Description
Contiguously store the active 32-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.
Operation
size := 32
m := 0
FOR j := 0 to 3
i := j*32
IF k[j]
dst[m+size-1:m] := a[i+31:i]
m := m + size
FI
ENDFOR
dst[127:m] := 0
dst[MAX:128] := 0
vpcompressd
__m256i _mm256_mask_compress_epi32 (__m256i src, __mmask8 k, __m256i a)
Synopsis
__m256i _mm256_mask_compress_epi32 (__m256i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpcompressd
CPUID Flags: AVX512VL + AVX512F
Description
Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
Operation
size := 32
m := 0
FOR j := 0 to 7
i := j*32
IF k[j]
dst[m+size-1:m] := a[i+31:i]
m := m + size
FI
ENDFOR
dst[255:m] := src[255:m]
dst[MAX:256] := 0
vpcompressd
__m256i _mm256_maskz_compress_epi32 (__mmask8 k, __m256i a)
Synopsis
__m256i _mm256_maskz_compress_epi32 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpcompressd
CPUID Flags: AVX512VL + AVX512F
Description
Contiguously store the active 32-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.
Operation
size := 32
m := 0
FOR j := 0 to 7
i := j*32
IF k[j]
dst[m+size-1:m] := a[i+31:i]
m := m + size
FI
ENDFOR
dst[255:m] := 0
dst[MAX:256] := 0
vpcompressd
__m512i _mm512_mask_compress_epi32 (__m512i src, __mmask16 k, __m512i a)
Synopsis
__m512i _mm512_mask_compress_epi32 (__m512i src, __mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpcompressd zmm {k}, zmm
CPUID Flags: AVX512F
Description
Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
Operation
size := 32
m := 0
FOR j := 0 to 15
i := j*32
IF k[j]
dst[m+size-1:m] := a[i+31:i]
m := m + size
FI
ENDFOR
dst[511:m] := src[511:m]
dst[MAX:512] := 0
vpcompressd
__m512i _mm512_maskz_compress_epi32 (__mmask16 k, __m512i a)
Synopsis
__m512i _mm512_maskz_compress_epi32 (__mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpcompressd zmm {k}, zmm
CPUID Flags: AVX512F
Description
Contiguously store the active 32-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.
Operation
size := 32
m := 0
FOR j := 0 to 15
i := j*32
IF k[j]
dst[m+size-1:m] := a[i+31:i]
m := m + size
FI
ENDFOR
dst[511:m] := 0
dst[MAX:512] := 0
vpcompressq
__m128i _mm_mask_compress_epi64 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_compress_epi64 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpcompressq
CPUID Flags: AVX512VL + AVX512F
Description
Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
Operation
size := 64
m := 0
FOR j := 0 to 1
i := j*64
IF k[j]
dst[m+size-1:m] := a[i+63:i]
m := m + size
FI
ENDFOR
dst[127:m] := src[127:m]
dst[MAX:128] := 0
vpcompressq
__m128i _mm_maskz_compress_epi64 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_compress_epi64 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpcompressq
CPUID Flags: AVX512VL + AVX512F
Description
Contiguously store the active 64-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.
Operation
size := 64
m := 0
FOR j := 0 to 1
i := j*64
IF k[j]
dst[m+size-1:m] := a[i+63:i]
m := m + size
FI
ENDFOR
dst[127:m] := 0
dst[MAX:128] := 0
vpcompressq
__m256i _mm256_mask_compress_epi64 (__m256i src, __mmask8 k, __m256i a)
Synopsis
__m256i _mm256_mask_compress_epi64 (__m256i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpcompressq
CPUID Flags: AVX512VL + AVX512F
Description
Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
Operation
size := 64
m := 0
FOR j := 0 to 3
i := j*64
IF k[j]
dst[m+size-1:m] := a[i+63:i]
m := m + size
FI
ENDFOR
dst[255:m] := src[255:m]
dst[MAX:256] := 0
vpcompressq
__m256i _mm256_maskz_compress_epi64 (__mmask8 k, __m256i a)
Synopsis
__m256i _mm256_maskz_compress_epi64 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpcompressq
CPUID Flags: AVX512VL + AVX512F
Description
Contiguously store the active 64-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.
Operation
size := 64
m := 0
FOR j := 0 to 3
i := j*64
IF k[j]
dst[m+size-1:m] := a[i+63:i]
m := m + size
FI
ENDFOR
dst[255:m] := 0
dst[MAX:256] := 0
vpcompressq
__m512i _mm512_mask_compress_epi64 (__m512i src, __mmask8 k, __m512i a)
Synopsis
__m512i _mm512_mask_compress_epi64 (__m512i src, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpcompressq zmm {k}, zmm
CPUID Flags: AVX512F
Description
Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
Operation
size := 64
m := 0
FOR j := 0 to 7
i := j*64
IF k[j]
dst[m+size-1:m] := a[i+63:i]
m := m + size
FI
ENDFOR
dst[511:m] := src[511:m]
dst[MAX:512] := 0
vpcompressq
__m512i _mm512_maskz_compress_epi64 (__mmask8 k, __m512i a)
Synopsis
__m512i _mm512_maskz_compress_epi64 (__mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpcompressq zmm {k}, zmm
CPUID Flags: AVX512F
Description
Contiguously store the active 64-bit integers in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.
Operation
size := 64
m := 0
FOR j := 0 to 7
i := j*64
IF k[j]
dst[m+size-1:m] := a[i+63:i]
m := m + size
FI
ENDFOR
dst[511:m] := 0
dst[MAX:512] := 0
vcompresspd
__m128d _mm_mask_compress_pd (__m128d src, __mmask8 k, __m128d a)
Synopsis
__m128d _mm_mask_compress_pd (__m128d src, __mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vcompresspd
CPUID Flags: AVX512VL + AVX512F
Description
Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
Operation
size := 64
m := 0
FOR j := 0 to 1
i := j*64
IF k[j]
dst[m+size-1:m] := a[i+63:i]
m := m + size
FI
ENDFOR
dst[127:m] := src[127:m]
dst[MAX:128] := 0
vcompresspd
__m128d _mm_maskz_compress_pd (__mmask8 k, __m128d a)
Synopsis
__m128d _mm_maskz_compress_pd (__mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vcompresspd
CPUID Flags: AVX512VL + AVX512F
Description
Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.
Operation
size := 64
m := 0
FOR j := 0 to 1
i := j*64
IF k[j]
dst[m+size-1:m] := a[i+63:i]
m := m + size
FI
ENDFOR
dst[127:m] := 0
dst[MAX:128] := 0
vcompresspd
__m256d _mm256_mask_compress_pd (__m256d src, __mmask8 k, __m256d a)
Synopsis
__m256d _mm256_mask_compress_pd (__m256d src, __mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vcompresspd
CPUID Flags: AVX512VL + AVX512F
Description
Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
Operation
size := 64
m := 0
FOR j := 0 to 3
i := j*64
IF k[j]
dst[m+size-1:m] := a[i+63:i]
m := m + size
FI
ENDFOR
dst[255:m] := src[255:m]
dst[MAX:256] := 0
vcompresspd
__m256d _mm256_maskz_compress_pd (__mmask8 k, __m256d a)
Synopsis
__m256d _mm256_maskz_compress_pd (__mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vcompresspd
CPUID Flags: AVX512VL + AVX512F
Description
Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.
Operation
size := 64
m := 0
FOR j := 0 to 3
i := j*64
IF k[j]
dst[m+size-1:m] := a[i+63:i]
m := m + size
FI
ENDFOR
dst[255:m] := 0
dst[MAX:256] := 0
vcompresspd
__m512d _mm512_mask_compress_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_compress_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vcompresspd zmm {k}, zmm
CPUID Flags: AVX512F
Description
Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
Operation
size := 64
m := 0
FOR j := 0 to 7
i := j*64
IF k[j]
dst[m+size-1:m] := a[i+63:i]
m := m + size
FI
ENDFOR
dst[511:m] := src[511:m]
dst[MAX:512] := 0
vcompresspd
__m512d _mm512_maskz_compress_pd (__mmask8 k, __m512d a)
Synopsis
__m512d _mm512_maskz_compress_pd (__mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vcompresspd zmm {k}, zmm
CPUID Flags: AVX512F
Description
Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.
Operation
size := 64
m := 0
FOR j := 0 to 7
i := j*64
IF k[j]
dst[m+size-1:m] := a[i+63:i]
m := m + size
FI
ENDFOR
dst[511:m] := 0
dst[MAX:512] := 0
vcompressps
__m128 _mm_mask_compress_ps (__m128 src, __mmask8 k, __m128 a)
Synopsis
__m128 _mm_mask_compress_ps (__m128 src, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcompressps
CPUID Flags: AVX512VL + AVX512F
Description
Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
Operation
size := 32
m := 0
FOR j := 0 to 3
i := j*32
IF k[j]
dst[m+size-1:m] := a[i+31:i]
m := m + size
FI
ENDFOR
dst[127:m] := src[127:m]
dst[MAX:128] := 0
vcompressps
__m128 _mm_maskz_compress_ps (__mmask8 k, __m128 a)
Synopsis
__m128 _mm_maskz_compress_ps (__mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcompressps
CPUID Flags: AVX512VL + AVX512F
Description
Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.
Operation
size := 32
m := 0
FOR j := 0 to 3
i := j*32
IF k[j]
dst[m+size-1:m] := a[i+31:i]
m := m + size
FI
ENDFOR
dst[127:m] := 0
dst[MAX:128] := 0
vcompressps
__m256 _mm256_mask_compress_ps (__m256 src, __mmask8 k, __m256 a)
Synopsis
__m256 _mm256_mask_compress_ps (__m256 src, __mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vcompressps
CPUID Flags: AVX512VL + AVX512F
Description
Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
Operation
size := 32
m := 0
FOR j := 0 to 7
i := j*32
IF k[j]
dst[m+size-1:m] := a[i+31:i]
m := m + size
FI
ENDFOR
dst[255:m] := src[255:m]
dst[MAX:256] := 0
vcompressps
__m256 _mm256_maskz_compress_ps (__mmask8 k, __m256 a)
Synopsis
__m256 _mm256_maskz_compress_ps (__mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vcompressps
CPUID Flags: AVX512VL + AVX512F
Description
Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.
Operation
size := 32
m := 0
FOR j := 0 to 7
i := j*32
IF k[j]
dst[m+size-1:m] := a[i+31:i]
m := m + size
FI
ENDFOR
dst[255:m] := 0
dst[MAX:256] := 0
vcompressps
__m512 _mm512_mask_compress_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_compress_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vcompressps zmm {k}, zmm
CPUID Flags: AVX512F
Description
Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to dst, and pass through the remaining elements from src.
Operation
size := 32
m := 0
FOR j := 0 to 15
i := j*32
IF k[j]
dst[m+size-1:m] := a[i+31:i]
m := m + size
FI
ENDFOR
dst[511:m] := src[511:m]
dst[MAX:512] := 0
vcompressps
__m512 _mm512_maskz_compress_ps (__mmask16 k, __m512 a)
Synopsis
__m512 _mm512_maskz_compress_ps (__mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vcompressps zmm {k}, zmm
CPUID Flags: AVX512F
Description
Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in zeromask k) to dst, and set the remaining elements to zero.
Operation
size := 32
m := 0
FOR j := 0 to 15
i := j*32
IF k[j]
dst[m+size-1:m] := a[i+31:i]
m := m + size
FI
ENDFOR
dst[511:m] := 0
dst[MAX:512] := 0
vpcompressd
void _mm_mask_compressstoreu_epi32 (void* base_addr, __mmask8 k, __m128i a)
Synopsis
void _mm_mask_compressstoreu_epi32 (void* base_addr, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpcompressd
CPUID Flags: AVX512VL + AVX512F
Description
Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
size := 32
m := base_addr
FOR j := 0 to 3
i := j*32
IF k[j]
MEM[m+size-1:m] := a[i+31:i]
m := m + size
FI
ENDFOR
vpcompressd
void _mm256_mask_compressstoreu_epi32 (void* base_addr, __mmask8 k, __m256i a)
Synopsis
void _mm256_mask_compressstoreu_epi32 (void* base_addr, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpcompressd
CPUID Flags: AVX512VL + AVX512F
Description
Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
size := 32
m := base_addr
FOR j := 0 to 7
i := j*32
IF k[j]
MEM[m+size-1:m] := a[i+31:i]
m := m + size
FI
ENDFOR
vpcompressd
void _mm512_mask_compressstoreu_epi32 (void* base_addr, __mmask16 k, __m512i a)
Synopsis
void _mm512_mask_compressstoreu_epi32 (void* base_addr, __mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpcompressd m32 {k}, zmm
CPUID Flags: AVX512F
Description
Contiguously store the active 32-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
size := 32
m := base_addr
FOR j := 0 to 15
i := j*32
IF k[j]
MEM[m+size-1:m] := a[i+31:i]
m := m + size
FI
ENDFOR
vpcompressq
void _mm_mask_compressstoreu_epi64 (void* base_addr, __mmask8 k, __m128i a)
Synopsis
void _mm_mask_compressstoreu_epi64 (void* base_addr, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpcompressq
CPUID Flags: AVX512VL + AVX512F
Description
Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
size := 64
m := base_addr
FOR j := 0 to 1
i := j*64
IF k[j]
MEM[m+size-1:m] := a[i+63:i]
m := m + size
FI
ENDFOR
vpcompressq
void _mm256_mask_compressstoreu_epi64 (void* base_addr, __mmask8 k, __m256i a)
Synopsis
void _mm256_mask_compressstoreu_epi64 (void* base_addr, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpcompressq
CPUID Flags: AVX512VL + AVX512F
Description
Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
size := 64
m := base_addr
FOR j := 0 to 3
i := j*64
IF k[j]
MEM[m+size-1:m] := a[i+63:i]
m := m + size
FI
ENDFOR
vpcompressq
void _mm512_mask_compressstoreu_epi64 (void* base_addr, __mmask8 k, __m512i a)
Synopsis
void _mm512_mask_compressstoreu_epi64 (void* base_addr, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpcompressq m64 {k}, zmm
CPUID Flags: AVX512F
Description
Contiguously store the active 64-bit integers in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
size := 64
m := base_addr
FOR j := 0 to 7
i := j*64
IF k[j]
MEM[m+size-1:m] := a[i+63:i]
m := m + size
FI
ENDFOR
vcompresspd
void _mm_mask_compressstoreu_pd (void* base_addr, __mmask8 k, __m128d a)
Synopsis
void _mm_mask_compressstoreu_pd (void* base_addr, __mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vcompresspd
CPUID Flags: AVX512VL + AVX512F
Description
Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
size := 64
m := base_addr
FOR j := 0 to 1
i := j*64
IF k[j]
MEM[m+size-1:m] := a[i+63:i]
m := m + size
FI
ENDFOR
vcompresspd
void _mm256_mask_compressstoreu_pd (void* base_addr, __mmask8 k, __m256d a)
Synopsis
void _mm256_mask_compressstoreu_pd (void* base_addr, __mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vcompresspd
CPUID Flags: AVX512VL + AVX512F
Description
Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
size := 64
m := base_addr
FOR j := 0 to 3
i := j*64
IF k[j]
MEM[m+size-1:m] := a[i+63:i]
m := m + size
FI
ENDFOR
vcompresspd
void _mm512_mask_compressstoreu_pd (void* base_addr, __mmask8 k, __m512d a)
Synopsis
void _mm512_mask_compressstoreu_pd (void* base_addr, __mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vcompresspd m512 {k}, zmm
CPUID Flags: AVX512F
Description
Contiguously store the active double-precision (64-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
size := 64
m := base_addr
FOR j := 0 to 7
i := j*64
IF k[j]
MEM[m+size-1:m] := a[i+63:i]
m := m + size
FI
ENDFOR
vcompressps
void _mm_mask_compressstoreu_ps (void* base_addr, __mmask8 k, __m128 a)
Synopsis
void _mm_mask_compressstoreu_ps (void* base_addr, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcompressps
CPUID Flags: AVX512VL + AVX512F
Description
Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
size := 32
m := base_addr
FOR j := 0 to 3
i := j*32
IF k[j]
MEM[m+size-1:m] := a[i+31:i]
m := m + size
FI
ENDFOR
vcompressps
void _mm256_mask_compressstoreu_ps (void* base_addr, __mmask8 k, __m256 a)
Synopsis
void _mm256_mask_compressstoreu_ps (void* base_addr, __mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vcompressps
CPUID Flags: AVX512VL + AVX512F
Description
Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
size := 32
m := base_addr
FOR j := 0 to 7
i := j*32
IF k[j]
MEM[m+size-1:m] := a[i+31:i]
m := m + size
FI
ENDFOR
vcompressps
void _mm512_mask_compressstoreu_ps (void* base_addr, __mmask16 k, __m512 a)
Synopsis
void _mm512_mask_compressstoreu_ps (void* base_addr, __mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vcompressps m512 {k}, zmm
CPUID Flags: AVX512F
Description
Contiguously store the active single-precision (32-bit) floating-point elements in a (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
size := 32
m := base_addr
FOR j := 0 to 15
i := j*32
IF k[j]
MEM[m+size-1:m] := a[i+31:i]
m := m + size
FI
ENDFOR
vpconflictd
__m128i _mm_conflict_epi32 (__m128i a)
Synopsis
__m128i _mm_conflict_epi32 (__m128i a)
#include "immintrin.h"
Instruction: vpconflictd
CPUID Flags: AVX512VL + AVX512CD
Description
Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit. Each element's comparison forms a zero extended bit vector in dst.
Operation
FOR j := 0 to 3
i := j*32
FOR k := 0 to j-1
m := k*32
dst[i+k] := (a[i+31:i] == a[m+31:m]) ? 1 : 0
ENDFOR
dst[i+31:i+j] := 0
ENDFOR
dst[MAX:128] := 0
vpconflictd
__m128i _mm_mask_conflict_epi32 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_conflict_epi32 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpconflictd
CPUID Flags: AVX512VL + AVX512CD
Description
Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit using writemask k (elements are copied from src when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst.
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
FOR l := 0 to j-1
m := l*32
dst[i+l] := (a[i+31:i] == a[m+31:m]) ? 1 : 0
ENDFOR
dst[i+31:i+j] := 0
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vpconflictd
__m128i _mm_maskz_conflict_epi32 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_conflict_epi32 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpconflictd
CPUID Flags: AVX512VL + AVX512CD
Description
Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst.
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
FOR l := 0 to j-1
m := l*32
dst[i+l] := (a[i+31:i] == a[m+31:m]) ? 1 : 0
ENDFOR
dst[i+31:i+j] := 0
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpconflictd
__m256i _mm256_conflict_epi32 (__m256i a)
Synopsis
__m256i _mm256_conflict_epi32 (__m256i a)
#include "immintrin.h"
Instruction: vpconflictd
CPUID Flags: AVX512VL + AVX512CD
Description
Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit. Each element's comparison forms a zero extended bit vector in dst.
Operation
FOR j := 0 to 7
i := j*32
FOR k := 0 to j-1
m := k*32
dst[i+k] := (a[i+31:i] == a[m+31:m]) ? 1 : 0
ENDFOR
dst[i+31:i+j] := 0
ENDFOR
dst[MAX:256] := 0
vpconflictd
__m256i _mm256_mask_conflict_epi32 (__m256i src, __mmask8 k, __m256i a)
Synopsis
__m256i _mm256_mask_conflict_epi32 (__m256i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpconflictd
CPUID Flags: AVX512VL + AVX512CD
Description
Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit using writemask k (elements are copied from src when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst.
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
FOR l := 0 to j-1
m := l*32
dst[i+l] := (a[i+31:i] == a[m+31:m]) ? 1 : 0
ENDFOR
dst[i+31:i+j] := 0
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vpconflictd
__m256i _mm256_maskz_conflict_epi32 (__mmask8 k, __m256i a)
Synopsis
__m256i _mm256_maskz_conflict_epi32 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpconflictd
CPUID Flags: AVX512VL + AVX512CD
Description
Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst.
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
FOR l := 0 to j-1
m := l*32
dst[i+l] := (a[i+31:i] == a[m+31:m]) ? 1 : 0
ENDFOR
dst[i+31:i+j] := 0
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpconflictd
__m512i _mm512_conflict_epi32 (__m512i a)
Synopsis
__m512i _mm512_conflict_epi32 (__m512i a)
#include "immintrin.h"
Instruction: vpconflictd zmm {k}, zmm
CPUID Flags: AVX512CD
Description
Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit. Each element's comparison forms a zero extended bit vector in dst.
Operation
FOR j := 0 to 15
i := j*32
FOR k := 0 to j-1
m := k*32
dst[i+k] := (a[i+31:i] == a[m+31:m]) ? 1 : 0
ENDFOR
dst[i+31:i+j] := 0
ENDFOR
dst[MAX:512] := 0
vpconflictd
__m512i _mm512_mask_conflict_epi32 (__m512i src, __mmask16 k, __m512i a)
Synopsis
__m512i _mm512_mask_conflict_epi32 (__m512i src, __mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpconflictd zmm {k}, zmm
CPUID Flags: AVX512CD
Description
Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit using writemask k (elements are copied from src when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst.
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
FOR l := 0 to j-1
m := l*32
dst[i+l] := (a[i+31:i] == a[m+31:m]) ? 1 : 0
ENDFOR
dst[i+31:i+j] := 0
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpconflictd
__m512i _mm512_maskz_conflict_epi32 (__mmask16 k, __m512i a)
Synopsis
__m512i _mm512_maskz_conflict_epi32 (__mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpconflictd zmm {k}, zmm
CPUID Flags: AVX512CD
Description
Test each 32-bit element of a for equality with all other elements in a closer to the least significant bit using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst.
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
FOR l := 0 to j-1
m := l*32
dst[i+l] := (a[i+31:i] == a[m+31:m]) ? 1 : 0
ENDFOR
dst[i+31:i+j] := 0
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpconflictq
__m128i _mm_conflict_epi64 (__m128i a)
Synopsis
__m128i _mm_conflict_epi64 (__m128i a)
#include "immintrin.h"
Instruction: vpconflictq
CPUID Flags: AVX512VL + AVX512CD
Description
Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit. Each element's comparison forms a zero extended bit vector in dst.
Operation
FOR j := 0 to 1
i := j*64
FOR k := 0 to j-1
m := k*64
dst[i+k] := (a[i+63:i] == a[m+63:m]) ? 1 : 0
ENDFOR
dst[i+63:i+j] := 0
ENDFOR
dst[MAX:128] := 0
vpconflictq
__m128i _mm_mask_conflict_epi64 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_conflict_epi64 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpconflictq
CPUID Flags: AVX512VL + AVX512CD
Description
Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit using writemask k (elements are copied from src when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst.
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
FOR l := 0 to j-1
m := l*64
dst[i+l] := (a[i+63:i] == a[m+63:m]) ? 1 : 0
ENDFOR
dst[i+63:i+j] := 0
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpconflictq
__m128i _mm_maskz_conflict_epi64 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_conflict_epi64 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpconflictq
CPUID Flags: AVX512VL + AVX512CD
Description
Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst.
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
FOR l := 0 to j-1
m := l*64
dst[i+l] := (a[i+63:i] == a[m+63:m]) ? 1 : 0
ENDFOR
dst[i+63:i+j] := 0
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpconflictq
__m256i _mm256_conflict_epi64 (__m256i a)
Synopsis
__m256i _mm256_conflict_epi64 (__m256i a)
#include "immintrin.h"
Instruction: vpconflictq
CPUID Flags: AVX512VL + AVX512CD
Description
Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit. Each element's comparison forms a zero extended bit vector in dst.
Operation
FOR j := 0 to 3
i := j*64
FOR k := 0 to j-1
m := k*64
dst[i+k] := (a[i+63:i] == a[m+63:m]) ? 1 : 0
ENDFOR
dst[i+63:i+j] := 0
ENDFOR
dst[MAX:256] := 0
vpconflictq
__m256i _mm256_mask_conflict_epi64 (__m256i src, __mmask8 k, __m256i a)
Synopsis
__m256i _mm256_mask_conflict_epi64 (__m256i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpconflictq
CPUID Flags: AVX512VL + AVX512CD
Description
Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit using writemask k (elements are copied from src when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst.
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
FOR l := 0 to j-1
m := l*64
dst[i+l] := (a[i+63:i] == a[m+63:m]) ? 1 : 0
ENDFOR
dst[i+63:i+j] := 0
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpconflictq
__m256i _mm256_maskz_conflict_epi64 (__mmask8 k, __m256i a)
Synopsis
__m256i _mm256_maskz_conflict_epi64 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpconflictq
CPUID Flags: AVX512VL + AVX512CD
Description
Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst.
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
FOR l := 0 to j-1
m := l*64
dst[i+l] := (a[i+63:i] == a[m+63:m]) ? 1 : 0
ENDFOR
dst[i+63:i+j] := 0
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpconflictq
__m512i _mm512_conflict_epi64 (__m512i a)
Synopsis
__m512i _mm512_conflict_epi64 (__m512i a)
#include "immintrin.h"
Instruction: vpconflictq zmm {k}, zmm
CPUID Flags: AVX512CD
Description
Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit. Each element's comparison forms a zero extended bit vector in dst.
Operation
FOR j := 0 to 7
i := j*64
FOR k := 0 to j-1
m := k*64
dst[i+k] := (a[i+63:i] == a[m+63:m]) ? 1 : 0
ENDFOR
dst[i+63:i+j] := 0
ENDFOR
dst[MAX:512] := 0
vpconflictq
__m512i _mm512_mask_conflict_epi64 (__m512i src, __mmask8 k, __m512i a)
Synopsis
__m512i _mm512_mask_conflict_epi64 (__m512i src, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpconflictq zmm {k}, zmm
CPUID Flags: AVX512CD
Description
Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit using writemask k (elements are copied from src when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst.
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
FOR l := 0 to j-1
m := l*64
dst[i+l] := (a[i+63:i] == a[m+63:m]) ? 1 : 0
ENDFOR
dst[i+63:i+j] := 0
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpconflictq
__m512i _mm512_maskz_conflict_epi64 (__mmask8 k, __m512i a)
Synopsis
__m512i _mm512_maskz_conflict_epi64 (__mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpconflictq zmm {k}, zmm
CPUID Flags: AVX512CD
Description
Test each 64-bit element of a for equality with all other elements in a closer to the least significant bit using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Each element's comparison forms a zero extended bit vector in dst.
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
FOR l := 0 to j-1
m := l*64
dst[i+l] := (a[i+63:i] == a[m+63:m]) ? 1 : 0
ENDFOR
dst[i+63:i+j] := 0
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
...
__m128d _mm_cos_pd (__m128d a)
Synopsis
__m128d _mm_cos_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the cosine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := COS(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
...
__m256d _mm256_cos_pd (__m256d a)
Synopsis
__m256d _mm256_cos_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the cosine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := COS(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
...
__m512d _mm512_cos_pd (__m512d a)
Synopsis
__m512d _mm512_cos_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the cosine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := COS(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
...
__m512d _mm512_mask_cos_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_cos_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the cosine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := COS(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128 _mm_cos_ps (__m128 a)
Synopsis
__m128 _mm_cos_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the cosine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := COS(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256 _mm256_cos_ps (__m256 a)
Synopsis
__m256 _mm256_cos_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the cosine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := COS(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
...
__m512 _mm512_cos_ps (__m512 a)
Synopsis
__m512 _mm512_cos_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the cosine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := COS(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
...
__m512 _mm512_mask_cos_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_cos_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the cosine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := COS(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128d _mm_cosd_pd (__m128d a)
Synopsis
__m128d _mm_cosd_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the cosine of packed double-precision (64-bit) floating-point elements in a expressed in degrees, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := COSD(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
...
__m256d _mm256_cosd_pd (__m256d a)
Synopsis
__m256d _mm256_cosd_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the cosine of packed double-precision (64-bit) floating-point elements in a expressed in degrees, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := COSD(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
...
__m512d _mm512_cosd_pd (__m512d a)
Synopsis
__m512d _mm512_cosd_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the cosine of packed double-precision (64-bit) floating-point elements in a expressed in degrees, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := COSD(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
...
__m512d _mm512_mask_cosd_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_cosd_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the cosine of packed double-precision (64-bit) floating-point elements in a expressed in degrees, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := COSD(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128 _mm_cosd_ps (__m128 a)
Synopsis
__m128 _mm_cosd_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the cosine of packed single-precision (32-bit) floating-point elements in a expressed in degrees, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := COSD(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256 _mm256_cosd_ps (__m256 a)
Synopsis
__m256 _mm256_cosd_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the cosine of packed single-precision (32-bit) floating-point elements in a expressed in degrees, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := COSD(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
...
__m512 _mm512_cosd_ps (__m512 a)
Synopsis
__m512 _mm512_cosd_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the cosine of packed single-precision (32-bit) floating-point elements in a expressed in degrees, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := COSD(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
...
__m512 _mm512_mask_cosd_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_cosd_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the cosine of packed single-precision (32-bit) floating-point elements in a expressed in degrees, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := COSD(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128d _mm_cosh_pd (__m128d a)
Synopsis
__m128d _mm_cosh_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the hyperbolic cosine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := COSH(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
...
__m256d _mm256_cosh_pd (__m256d a)
Synopsis
__m256d _mm256_cosh_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the hyperbolic cosine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := COSH(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
...
__m512d _mm512_cosh_pd (__m512d a)
Synopsis
__m512d _mm512_cosh_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the hyperbolic cosine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := COSH(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
...
__m512d _mm512_mask_cosh_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_cosh_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the hyperbolic cosine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := COSH(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128 _mm_cosh_ps (__m128 a)
Synopsis
__m128 _mm_cosh_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the hyperbolic cosine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := COSH(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256 _mm256_cosh_ps (__m256 a)
Synopsis
__m256 _mm256_cosh_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the hyperbolic cosine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := COSH(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
...
__m512 _mm512_cosh_ps (__m512 a)
Synopsis
__m512 _mm512_cosh_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the hyperbolic cosine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := COSH(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
...
__m512 _mm512_mask_cosh_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_cosh_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the hyperbolic cosine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := COSH(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
popcnt
unsigned int _mm_countbits_32 (unsigned int r1)
Synopsis
unsigned int _mm_countbits_32 (unsigned int r1)
#include "immintrin.h"
Instruction: popcnt r32, r32
CPUID Flags: KNCNI
Description
Counts the number of set bits in 32-bit unsigned integer r1, returning the results in dst.
Operation
dst[31:0] := PopCount(r1[31:0])
popcnt
unsigned __int64 _mm_countbits_64 (unsigned __int64 r1)
Synopsis
unsigned __int64 _mm_countbits_64 (unsigned __int64 r1)
#include "immintrin.h"
Instruction: popcnt r64, r64
CPUID Flags: KNCNI
Description
Counts the number of set bits in 64-bit unsigned integer r1, returning the results in dst.
Operation
dst[63:0] := PopCount(r1[63:0])
crc32
unsigned int _mm_crc32_u16 (unsigned int crc, unsigned short v)
Synopsis
unsigned int _mm_crc32_u16 (unsigned int crc, unsigned short v)
#include "nmmintrin.h"
Instruction: crc32 r32, r16
CPUID Flags: SSE4.2
Description
Starting with the initial value in crc, accumulates a CRC32 value for unsigned 16-bit integer v, and stores the result in dst.
Operation
tmp1[15:0] := v[0:15] // bit reflection
tmp2[31:0] := crc[0:31] // bit reflection
tmp3[47:0] := tmp1[15:0] << 32
tmp4[47:0] := tmp2[31:0] << 16
tmp5[47:0] := tmp3[47:0] XOR tmp4[47:0]
tmp6[31:0] := tmp5[47:0] MOD2 0x11EDC6F41
dst[31:0] := tmp6[0:31] // bit reflection
Performance
crc32
unsigned int _mm_crc32_u32 (unsigned int crc, unsigned int v)
Synopsis
unsigned int _mm_crc32_u32 (unsigned int crc, unsigned int v)
#include "nmmintrin.h"
Instruction: crc32 r32, r32
CPUID Flags: SSE4.2
Description
Starting with the initial value in crc, accumulates a CRC32 value for unsigned 32-bit integer v, and stores the result in dst.
Operation
tmp1[31:0] := v[0:31] // bit reflection
tmp2[31:0] := crc[0:31] // bit reflection
tmp3[63:0] := tmp1[31:0] << 32
tmp4[63:0] := tmp2[31:0] << 32
tmp5[63:0] := tmp3[63:0] XOR tmp4[63:0]
tmp6[31:0] := tmp5[63:0] MOD2 0x11EDC6F41
dst[31:0] := tmp6[0:31] // bit reflection
Performance
crc32
unsigned __int64 _mm_crc32_u64 (unsigned __int64 crc, unsigned __int64 v)
Synopsis
unsigned __int64 _mm_crc32_u64 (unsigned __int64 crc, unsigned __int64 v)
#include "nmmintrin.h"
Instruction: crc32 r64, r64
CPUID Flags: SSE4.2
Description
Starting with the initial value in crc, accumulates a CRC32 value for unsigned 64-bit integer v, and stores the result in dst.
Operation
tmp1[63:0] := v[0:63] // bit reflection
tmp2[31:0] := crc[0:31] // bit reflection
tmp3[95:0] := tmp1[63:0] << 32
tmp4[95:0] := tmp2[31:0] << 64
tmp5[95:0] := tmp3[95:0] XOR tmp4[95:0]
tmp6[31:0] := tmp5[95:0] MOD2 0x11EDC6F41
dst[31:0] := tmp6[0:31] // bit reflection
Performance
crc32
unsigned int _mm_crc32_u8 (unsigned int crc, unsigned char v)
Synopsis
unsigned int _mm_crc32_u8 (unsigned int crc, unsigned char v)
#include "nmmintrin.h"
Instruction: crc32 r32, r8
CPUID Flags: SSE4.2
Description
Starting with the initial value in crc, accumulates a CRC32 value for unsigned 8-bit integer v, and stores the result in dst.
Operation
tmp1[7:0] := v[0:7] // bit reflection
tmp2[31:0] := crc[0:31] // bit reflection
tmp3[39:0] := tmp1[7:0] << 32
tmp4[39:0] := tmp2[31:0] << 8
tmp5[39:0] := tmp3[39:0] XOR tmp4[39:0]
tmp6[31:0] := tmp5[39:0] MOD2 0x11EDC6F41
dst[31:0] := tmp6[0:31] // bit reflection
Performance
...
__m128 _mm_csqrt_ps (__m128 a)
Synopsis
__m128 _mm_csqrt_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the square root of packed complex single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := CSQRT(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
...
__m256 _mm256_csqrt_ps (__m256 a)
Synopsis
__m256 _mm256_csqrt_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the square root of packed complex single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := CSQRT(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
cvtpi2ps
__m128 _mm_cvt_pi2ps (__m128 a, __m64 b)
Synopsis
__m128 _mm_cvt_pi2ps (__m128 a, __m64 b)
#include "xmmintrin.h"
Instruction: cvtpi2ps xmm, mm
CPUID Flags: SSE
Description
Convert packed 32-bit integers in b to packed single-precision (32-bit) floating-point elements, store the results in the lower 2 elements of dst, and copy the upper 2 packed elements from a to the upper elements of dst.
Operation
dst[31:0] := Convert_Int32_To_FP32(b[31:0])
dst[63:32] := Convert_Int32_To_FP32(b[63:32])
dst[95:64] := a[95:64]
dst[127:96] := a[127:96]
Performance
cvtps2pi
__m64 _mm_cvt_ps2pi (__m128 a)
Synopsis
__m64 _mm_cvt_ps2pi (__m128 a)
#include "xmmintrin.h"
Instruction: cvtps2pi mm, xmm
CPUID Flags: SSE
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst.
Operation
FOR j := 0 to 1
i := 32*j
dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
ENDFOR
Performance
vcvtdq2ps
__m512 _mm512_cvt_roundepi32_ps (__m512i a, int rounding)
Synopsis
__m512 _mm512_cvt_roundepi32_ps (__m512i a, int rounding)
#include "immintrin.h"
Instruction: vcvtdq2ps zmm {k}, zmm {er}
CPUID Flags: AVX512F
Description
Convert packed 32-bit integers in
a to packed single-precision (32-bit) floating-point elements, and store the results in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := 32*j
dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
vcvtdq2ps
__m512 _mm512_mask_cvt_roundepi32_ps (__m512 src, __mmask16 k, __m512i a, int rounding)
Synopsis
__m512 _mm512_mask_cvt_roundepi32_ps (__m512 src, __mmask16 k, __m512i a, int rounding)
#include "immintrin.h"
Instruction: vcvtdq2ps zmm {k}, zmm {er}
CPUID Flags: AVX512F
Description
Convert packed 32-bit integers in
a to packed single-precision (32-bit) floating-point elements, and store the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vcvtdq2ps
__m512 _mm512_maskz_cvt_roundepi32_ps (__mmask16 k, __m512i a, int rounding)
Synopsis
__m512 _mm512_maskz_cvt_roundepi32_ps (__mmask16 k, __m512i a, int rounding)
#include "immintrin.h"
Instruction: vcvtdq2ps zmm {k}, zmm {er}
CPUID Flags: AVX512F
Description
Convert packed 32-bit integers in
a to packed single-precision (32-bit) floating-point elements, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := 32*j
IF k[j]
dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vcvtqq2pd
__m512d _mm512_cvt_roundepi64_pd (__m512i a, int rounding)
Synopsis
__m512d _mm512_cvt_roundepi64_pd (__m512i a, int rounding)
#include "immintrin.h"
Instruction: vcvtqq2pd
CPUID Flags: AVX512DQ
Description
Convert packed 64-bit integers in
a to packed double-precision (64-bit) floating-point elements, and store the results in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
vcvtqq2pd
__m512d _mm512_mask_cvt_roundepi64_pd (__m512d src, __mmask8 k, __m512i a, int rounding)
Synopsis
__m512d _mm512_mask_cvt_roundepi64_pd (__m512d src, __mmask8 k, __m512i a, int rounding)
#include "immintrin.h"
Instruction: vcvtqq2pd
CPUID Flags: AVX512DQ
Description
Convert packed 64-bit integers in
a to packed double-precision (64-bit) floating-point elements, and store the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vcvtqq2pd
__m512d _mm512_maskz_cvt_roundepi64_pd (__mmask8 k, __m512i a, int rounding)
Synopsis
__m512d _mm512_maskz_cvt_roundepi64_pd (__mmask8 k, __m512i a, int rounding)
#include "immintrin.h"
Instruction: vcvtqq2pd
CPUID Flags: AVX512DQ
Description
Convert packed 64-bit integers in
a to packed double-precision (64-bit) floating-point elements, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vcvtqq2ps
__m256 _mm512_cvt_roundepi64_ps (__m512i a, int rounding)
Synopsis
__m256 _mm512_cvt_roundepi64_ps (__m512i a, int rounding)
#include "immintrin.h"
Instruction: vcvtqq2ps
CPUID Flags: AVX512DQ
Description
Convert packed 64-bit integers in
a to packed single-precision (32-bit) floating-point elements, and store the results in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
l := j*32
dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
vcvtqq2ps
__m256 _mm512_mask_cvt_roundepi64_ps (__m256 src, __mmask8 k, __m512i a, int rounding)
Synopsis
__m256 _mm512_mask_cvt_roundepi64_ps (__m256 src, __mmask8 k, __m512i a, int rounding)
#include "immintrin.h"
Instruction: vcvtqq2ps
CPUID Flags: AVX512DQ
Description
Convert packed 64-bit integers in
a to packed single-precision (32-bit) floating-point elements, and store the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
l := j*32
IF k[j]
dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
ELSE
dst[l+31:l] := src[l+31:l]
FI
ENDFOR
dst[MAX:256] := 0
vcvtqq2ps
__m256 _mm512_maskz_cvt_roundepi64_ps (__mmask8 k, __m512i a, int rounding)
Synopsis
__m256 _mm512_maskz_cvt_roundepi64_ps (__mmask8 k, __m512i a, int rounding)
#include "immintrin.h"
Instruction: vcvtqq2ps
CPUID Flags: AVX512DQ
Description
Convert packed 64-bit integers in
a to packed single-precision (32-bit) floating-point elements, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
l := j*32
IF k[j]
dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
ELSE
dst[l+31:l] := 0
FI
ENDFOR
dst[MAX:256] := 0
vcvtudq2ps
__m512 _mm512_cvt_roundepu32_ps (__m512i a, int rounding)
Synopsis
__m512 _mm512_cvt_roundepu32_ps (__m512i a, int rounding)
#include "immintrin.h"
Instruction: vcvtudq2ps zmm {k}, zmm {er}
CPUID Flags: AVX512F
Description
Convert packed unsigned 32-bit integers in
a to packed single-precision (32-bit) floating-point elements, and store the results in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := 32*j
dst[i+31:i] := ConvertUnsignedInt32_To_FP32(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
vcvtudq2ps
__m512 _mm512_mask_cvt_roundepu32_ps (__m512 src, __mmask16 k, __m512i a, int rounding)
Synopsis
__m512 _mm512_mask_cvt_roundepu32_ps (__m512 src, __mmask16 k, __m512i a, int rounding)
#include "immintrin.h"
Instruction: vcvtudq2ps zmm {k}, zmm {er}
CPUID Flags: AVX512F
Description
Convert packed unsigned 32-bit integers in
a to packed single-precision (32-bit) floating-point elements, and store the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := ConvertUnsignedInt32_To_FP32(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vcvtudq2ps
__m512 _mm512_maskz_cvt_roundepu32_ps (__mmask16 k, __m512i a, int rounding)
Synopsis
__m512 _mm512_maskz_cvt_roundepu32_ps (__mmask16 k, __m512i a, int rounding)
#include "immintrin.h"
Instruction: vcvtudq2ps zmm {k}, zmm {er}
CPUID Flags: AVX512F
Description
Convert packed unsigned 32-bit integers in
a to packed single-precision (32-bit) floating-point elements, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := 32*j
IF k[j]
dst[i+31:i] := ConvertUnsignedInt32_To_FP32(a[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vcvtuqq2pd
__m512d _mm512_cvt_roundepu64_pd (__m512i a, int rounding)
Synopsis
__m512d _mm512_cvt_roundepu64_pd (__m512i a, int rounding)
#include "immintrin.h"
Instruction: vcvtuqq2pd
CPUID Flags: AVX512DQ
Description
Convert packed unsigned 64-bit integers in
a to packed double-precision (64-bit) floating-point elements, and store the results in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
vcvtuqq2pd
__m512d _mm512_mask_cvt_roundepu64_pd (__m512d src, __mmask8 k, __m512i a, int rounding)
Synopsis
__m512d _mm512_mask_cvt_roundepu64_pd (__m512d src, __mmask8 k, __m512i a, int rounding)
#include "immintrin.h"
Instruction: vcvtuqq2pd
CPUID Flags: AVX512DQ
Description
Convert packed unsigned 64-bit integers in
a to packed double-precision (64-bit) floating-point elements, and store the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vcvtuqq2pd
__m512d _mm512_maskz_cvt_roundepu64_pd (__mmask8 k, __m512i a, int rounding)
Synopsis
__m512d _mm512_maskz_cvt_roundepu64_pd (__mmask8 k, __m512i a, int rounding)
#include "immintrin.h"
Instruction: vcvtuqq2pd
CPUID Flags: AVX512DQ
Description
Convert packed unsigned 64-bit integers in
a to packed double-precision (64-bit) floating-point elements, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vcvtuqq2ps
__m256 _mm512_cvt_roundepu64_ps (__m512i a, int rounding)
Synopsis
__m256 _mm512_cvt_roundepu64_ps (__m512i a, int rounding)
#include "immintrin.h"
Instruction: vcvtuqq2ps
CPUID Flags: AVX512DQ
Description
Convert packed unsigned 64-bit integers in
a to packed single-precision (32-bit) floating-point elements, and store the results in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
l := j*32
dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
vcvtuqq2ps
__m256 _mm512_mask_cvt_roundepu64_ps (__m256 src, __mmask8 k, __m512i a, int rounding)
Synopsis
__m256 _mm512_mask_cvt_roundepu64_ps (__m256 src, __mmask8 k, __m512i a, int rounding)
#include "immintrin.h"
Instruction: vcvtuqq2ps
CPUID Flags: AVX512DQ
Description
Convert packed unsigned 64-bit integers in
a to packed single-precision (32-bit) floating-point elements, and store the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
l := j*32
IF k[j]
dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i])
ELSE
dst[l+31:l] := src[l+31:l]
FI
ENDFOR
dst[MAX:256] := 0
vcvtuqq2ps
__m256 _mm512_maskz_cvt_roundepu64_ps (__mmask8 k, __m512i a, int rounding)
Synopsis
__m256 _mm512_maskz_cvt_roundepu64_ps (__mmask8 k, __m512i a, int rounding)
#include "immintrin.h"
Instruction: vcvtuqq2ps
CPUID Flags: AVX512DQ
Description
Convert packed unsigned 64-bit integers in
a to packed single-precision (32-bit) floating-point elements, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
l := j*32
IF k[j]
dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i])
ELSE
dst[l+31:l] := 0
FI
ENDFOR
dst[MAX:256] := 0
vcvtsi2ss
__m128 _mm_cvt_roundi32_ss (__m128 a, int b, int rounding)
Synopsis
__m128 _mm_cvt_roundi32_ss (__m128 a, int b, int rounding)
#include "immintrin.h"
Instruction: vcvtsi2ss xmm, xmm, r32 {er}
CPUID Flags: AVX512F
Description
Convert the 32-bit integer
b to a single-precision (32-bit) floating-point element, store the result in the lower element of
dst, and copy the upper 3 packed elements from
a to the upper elements of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[31:0] := Convert_Int32_To_FP32(b[31:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vcvtsi2sd
__m128d _mm_cvt_roundi64_sd (__m128d a, __int64 b, int rounding)
Synopsis
__m128d _mm_cvt_roundi64_sd (__m128d a, __int64 b, int rounding)
#include "immintrin.h"
Instruction: vcvtsi2sd xmm, xmm, r64 {er}
CPUID Flags: AVX512F
Description
Convert the 64-bit integer
b to a double-precision (64-bit) floating-point element, store the result in the lower element of
dst, and copy the upper element from
a to the upper element of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[63:0] := Convert_Int64_To_FP64(b[63:0])
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vcvtsi2ss
__m128 _mm_cvt_roundi64_ss (__m128 a, __int64 b, int rounding)
Synopsis
__m128 _mm_cvt_roundi64_ss (__m128 a, __int64 b, int rounding)
#include "immintrin.h"
Instruction: vcvtsi2ss xmm, xmm, r64 {er}
CPUID Flags: AVX512F
Description
Convert the 64-bit integer
b to a single-precision (32-bit) floating-point element, store the result in the lower element of
dst, and copy the upper 3 packed elements from
a to the upper elements of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[31:0] := Convert_Int64_To_FP32(b[63:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vcvtpd2dq
__m256i _mm512_cvt_roundpd_epi32 (__m512d a, int rounding)
Synopsis
__m256i _mm512_cvt_roundpd_epi32 (__m512d a, int rounding)
#include "immintrin.h"
Instruction: vcvtpd2dq ymm {k}, zmm {er}
CPUID Flags: AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in
a to packed 32-bit integers, and store the results in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := 32*j
k := 64*j
dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
ENDFOR
dst[MAX:256] := 0
vcvtpd2dq
__m256i _mm512_mask_cvt_roundpd_epi32 (__m256i src, __mmask8 k, __m512d a, int rounding)
Synopsis
__m256i _mm512_mask_cvt_roundpd_epi32 (__m256i src, __mmask8 k, __m512d a, int rounding)
#include "immintrin.h"
Instruction: vcvtpd2dq ymm {k}, zmm {er}
CPUID Flags: AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in
a to packed 32-bit integers, and store the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*32
l := j*64
IF k[j]
dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vcvtpd2dq
__m256i _mm512_maskz_cvt_roundpd_epi32 (__mmask8 k, __m512d a, int rounding)
Synopsis
__m256i _mm512_maskz_cvt_roundpd_epi32 (__mmask8 k, __m512d a, int rounding)
#include "immintrin.h"
Instruction: vcvtpd2dq ymm {k}, zmm {er}
CPUID Flags: AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in
a to packed 32-bit integers, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := 32*j
l := 64*j
IF k[j]
dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vcvtpd2qq
__m512i _mm512_cvt_roundpd_epi64 (__m512d a, int rounding)
Synopsis
__m512i _mm512_cvt_roundpd_epi64 (__m512d a, int rounding)
#include "immintrin.h"
Instruction: vcvtpd2qq
CPUID Flags: AVX512DQ
Description
Convert packed double-precision (64-bit) floating-point elements in
a to packed 64-bit integers, and store the results in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
vcvtpd2qq
__m512i _mm512_mask_cvt_roundpd_epi64 (__m512i src, __mmask8 k, __m512d a, int rounding)
Synopsis
__m512i _mm512_mask_cvt_roundpd_epi64 (__m512i src, __mmask8 k, __m512d a, int rounding)
#include "immintrin.h"
Instruction: vcvtpd2qq
CPUID Flags: AVX512DQ
Description
Convert packed double-precision (64-bit) floating-point elements in
a to packed 64-bit integers, and store the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vcvtpd2qq
__m512i _mm512_maskz_cvt_roundpd_epi64 (__mmask8 k, __m512d a, int rounding)
Synopsis
__m512i _mm512_maskz_cvt_roundpd_epi64 (__mmask8 k, __m512d a, int rounding)
#include "immintrin.h"
Instruction: vcvtpd2qq
CPUID Flags: AVX512DQ
Description
Convert packed double-precision (64-bit) floating-point elements in
a to packed 64-bit integers, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vcvtpd2udq
__m256i _mm512_cvt_roundpd_epu32 (__m512d a, int rounding)
Synopsis
__m256i _mm512_cvt_roundpd_epu32 (__m512d a, int rounding)
#include "immintrin.h"
Instruction: vcvtpd2udq ymm {k}, zmm {er}
CPUID Flags: AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in
a to packed unsigned 32-bit integers, and store the results in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := 32*j
k := 64*j
dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[k+63:k])
ENDFOR
dst[MAX:256] := 0
vcvtpd2udq
__m256i _mm512_mask_cvt_roundpd_epu32 (__m256i src, __mmask8 k, __m512d a, int rounding)
Synopsis
__m256i _mm512_mask_cvt_roundpd_epu32 (__m256i src, __mmask8 k, __m512d a, int rounding)
#include "immintrin.h"
Instruction: vcvtpd2udq ymm {k}, zmm {er}
CPUID Flags: AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in
a to packed unsigned 32-bit integers, and store the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*32
l := j*64
IF k[j]
dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[l+63:l])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vcvtpd2udq
__m256i _mm512_maskz_cvt_roundpd_epu32 (__mmask8 k, __m512d a, int rounding)
Synopsis
__m256i _mm512_maskz_cvt_roundpd_epu32 (__mmask8 k, __m512d a, int rounding)
#include "immintrin.h"
Instruction: vcvtpd2udq ymm {k}, zmm {er}
CPUID Flags: AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in
a to packed unsigned 32-bit integers, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := 32*j
l := 64*j
IF k[j]
dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[l+63:l])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vcvtpd2uqq
__m512i _mm512_cvt_roundpd_epu64 (__m512d a, int rounding)
Synopsis
__m512i _mm512_cvt_roundpd_epu64 (__m512d a, int rounding)
#include "immintrin.h"
Instruction: vcvtpd2uqq
CPUID Flags: AVX512DQ
Description
Convert packed double-precision (64-bit) floating-point elements in
a to packed unsigned 64-bit integers, and store the results in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
vcvtpd2uqq
__m512i _mm512_mask_cvt_roundpd_epu64 (__m512i src, __mmask8 k, __m512d a, int rounding)
Synopsis
__m512i _mm512_mask_cvt_roundpd_epu64 (__m512i src, __mmask8 k, __m512d a, int rounding)
#include "immintrin.h"
Instruction: vcvtpd2uqq
CPUID Flags: AVX512DQ
Description
Convert packed double-precision (64-bit) floating-point elements in
a to packed unsigned 64-bit integers, and store the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vcvtpd2uqq
__m512i _mm512_maskz_cvt_roundpd_epu64 (__mmask8 k, __m512d a, int rounding)
Synopsis
__m512i _mm512_maskz_cvt_roundpd_epu64 (__mmask8 k, __m512d a, int rounding)
#include "immintrin.h"
Instruction: vcvtpd2uqq
CPUID Flags: AVX512DQ
Description
Convert packed double-precision (64-bit) floating-point elements in
a to packed unsigned 64-bit integers, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vcvtpd2ps
__m256 _mm512_cvt_roundpd_ps (__m512d a, int rounding)
Synopsis
__m256 _mm512_cvt_roundpd_ps (__m512d a, int rounding)
#include "immintrin.h"
Instruction: vcvtpd2ps ymm {k}, zmm {er}
CPUID Flags: AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in
a to packed single-precision (32-bit) floating-point elements, and store the results in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := 32*j
k := 64*j
dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k])
ENDFOR
dst[MAX:256] := 0
vcvtpd2ps
__m256 _mm512_mask_cvt_roundpd_ps (__m256 src, __mmask8 k, __m512d a, int rounding)
Synopsis
__m256 _mm512_mask_cvt_roundpd_ps (__m256 src, __mmask8 k, __m512d a, int rounding)
#include "immintrin.h"
Instruction: vcvtpd2ps ymm {k}, zmm {er}
CPUID Flags: AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in
a to packed single-precision (32-bit) floating-point elements, and store the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*32
l := j*64
IF k[j]
dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vcvtpd2ps
__m256 _mm512_maskz_cvt_roundpd_ps (__mmask8 k, __m512d a, int rounding)
Synopsis
__m256 _mm512_maskz_cvt_roundpd_ps (__mmask8 k, __m512d a, int rounding)
#include "immintrin.h"
Instruction: vcvtpd2ps ymm {k}, zmm {er}
CPUID Flags: AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in
a to packed single-precision (32-bit) floating-point elements, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*32
l := j*64
IF k[j]
dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vcvtpd2ps
__m512 _mm512_cvt_roundpd_pslo (__m512d v2, int rounding)
Synopsis
__m512 _mm512_cvt_roundpd_pslo (__m512d v2, int rounding)
#include "immintrin.h"
Instruction: vcvtpd2ps zmm {k}, zmm
CPUID Flags: KNCNI
Description
Performs element-by-element conversion of packed double-precision (64-bit) floating-point elements in
v2 to packed single-precision (32-bit) floating-point elements, storing the results in
dst. Results are written to the lower half of
dst, and the upper half locations are set to '0'.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
k := j*32
dst[k+31:k] := Float64ToFloat32(v2[i+63:i])
ENDFOR
dst[MAX:512] := 0
vcvtpd2ps
__m512 _mm512_mask_cvt_roundpd_pslo (__m512 src, __mmask8 k, __m512d v2, int rounding)
Synopsis
__m512 _mm512_mask_cvt_roundpd_pslo (__m512 src, __mmask8 k, __m512d v2, int rounding)
#include "immintrin.h"
Instruction: vcvtpd2ps zmm {k}, zmm
CPUID Flags: KNCNI
Description
Performs element-by-element conversion of packed double-precision (64-bit) floating-point elements in
v2 to packed single-precision (32-bit) floating-point elements, storing the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set). Results are written to the lower half of
dst, and the upper half locations are set to '0'.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
l := j*32
IF k[j]
dst[l+31:l] := Float64ToFloat32(v2[i+63:i])
ELSE
dst[l+31:l] := src[l+31:l]
FI
ENDFOR
dst[MAX:512] := 0
vcvtph2ps
__m512 _mm512_cvt_roundph_ps (__m256i a, int sae)
Synopsis
__m512 _mm512_cvt_roundph_ps (__m256i a, int sae)
#include "immintrin.h"
Instruction: vcvtph2ps zmm {k}, ymm {sae}
CPUID Flags: AVX512F
Description
Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.
Pass _MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
FOR j := 0 to 15
i := j*32
m := j*16
dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
ENDFOR
dst[MAX:512] := 0
vcvtph2ps
__m512 _mm512_mask_cvt_roundph_ps (__m512 src, __mmask16 k, __m256i a, int sae)
Synopsis
__m512 _mm512_mask_cvt_roundph_ps (__m512 src, __mmask16 k, __m256i a, int sae)
#include "immintrin.h"
Instruction: vcvtph2ps zmm {k}, ymm {sae}
CPUID Flags: AVX512F
Description
Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Pass _MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
FOR j := 0 to 15
i := j*32
m := j*16
IF k[j]
dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vcvtph2ps
__m512 _mm512_maskz_cvt_roundph_ps (__mmask16 k, __m256i a, int sae)
Synopsis
__m512 _mm512_maskz_cvt_roundph_ps (__mmask16 k, __m256i a, int sae)
#include "immintrin.h"
Instruction: vcvtph2ps zmm {k}, ymm {sae}
CPUID Flags: AVX512F
Description
Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Pass _MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
FOR j := 0 to 15
i := j*32
m := j*16
IF k[j]
dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vcvtps2dq
__m512i _mm512_cvt_roundps_epi32 (__m512 a, int rounding)
Synopsis
__m512i _mm512_cvt_roundps_epi32 (__m512 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2dq zmm {k}, zmm {er}
CPUID Flags: AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in
a to packed 32-bit integers, and store the results in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := 32*j
dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
vcvtps2dq
__m512i _mm512_mask_cvt_roundps_epi32 (__m512i src, __mmask16 k, __m512 a, int rounding)
Synopsis
__m512i _mm512_mask_cvt_roundps_epi32 (__m512i src, __mmask16 k, __m512 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2dq zmm {k}, zmm {er}
CPUID Flags: AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in
a to packed 32-bit integers, and store the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vcvtps2dq
__m512i _mm512_maskz_cvt_roundps_epi32 (__mmask16 k, __m512 a, int rounding)
Synopsis
__m512i _mm512_maskz_cvt_roundps_epi32 (__mmask16 k, __m512 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2dq zmm {k}, zmm {er}
CPUID Flags: AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in
a to packed 32-bit integers, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := 32*j
IF k[j]
dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vcvtps2qq
__m512i _mm512_cvt_roundps_epi64 (__m256 a, int rounding)
Synopsis
__m512i _mm512_cvt_roundps_epi64 (__m256 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2qq
CPUID Flags: AVX512DQ
Description
Convert packed single-precision (32-bit) floating-point elements in
a to packed 64-bit integers, and store the results in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
l := j*32
dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
ENDFOR
dst[MAX:512] := 0
vcvtps2qq
__m512i _mm512_mask_cvt_roundps_epi64 (__m512i src, __mmask8 k, __m256 a, int rounding)
Synopsis
__m512i _mm512_mask_cvt_roundps_epi64 (__m512i src, __mmask8 k, __m256 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2qq
CPUID Flags: AVX512DQ
Description
Convert packed single-precision (32-bit) floating-point elements in
a to packed 64-bit integers, and store the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
l := j*32
IF k[j]
dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vcvtps2qq
__m512i _mm512_maskz_cvt_roundps_epi64 (__mmask8 k, __m256 a, int rounding)
Synopsis
__m512i _mm512_maskz_cvt_roundps_epi64 (__mmask8 k, __m256 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2qq
CPUID Flags: AVX512DQ
Description
Convert packed single-precision (32-bit) floating-point elements in
a to packed 64-bit integers, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
l := j*32
IF k[j]
dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vcvtps2udq
__m512i _mm512_cvt_roundps_epu32 (__m512 a, int rounding)
Synopsis
__m512i _mm512_cvt_roundps_epu32 (__m512 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2udq zmm {k}, zmm {er}
CPUID Flags: AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in
a to packed unsigned 32-bit integers, and store the results in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := 32*j
dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
vcvtps2udq
__m512i _mm512_mask_cvt_roundps_epu32 (__m512i src, __mmask16 k, __m512 a, int rounding)
Synopsis
__m512i _mm512_mask_cvt_roundps_epu32 (__m512i src, __mmask16 k, __m512 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2udq zmm {k}, zmm {er}
CPUID Flags: AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in
a to packed unsigned 32-bit integers, and store the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vcvtps2udq
__m512i _mm512_maskz_cvt_roundps_epu32 (__mmask16 k, __m512 a, int rounding)
Synopsis
__m512i _mm512_maskz_cvt_roundps_epu32 (__mmask16 k, __m512 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2udq zmm {k}, zmm {er}
CPUID Flags: AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in
a to packed unsigned 32-bit integers, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := 32*j
IF k[j]
dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vcvtps2uqq
__m512i _mm512_cvt_roundps_epu64 (__m256 a, int rounding)
Synopsis
__m512i _mm512_cvt_roundps_epu64 (__m256 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2uqq
CPUID Flags: AVX512DQ
Description
Convert packed single-precision (32-bit) floating-point elements in
a to packed unsigned 64-bit integers, and store the results in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
l := j*32
dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l])
ENDFOR
dst[MAX:512] := 0
vcvtps2uqq
__m512i _mm512_mask_cvt_roundps_epu64 (__m512i src, __mmask8 k, __m256 a, int rounding)
Synopsis
__m512i _mm512_mask_cvt_roundps_epu64 (__m512i src, __mmask8 k, __m256 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2uqq
CPUID Flags: AVX512DQ
Description
Convert packed single-precision (32-bit) floating-point elements in
a to packed unsigned 64-bit integers, and store the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
l := j*32
IF k[j]
dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vcvtps2uqq
__m512i _mm512_maskz_cvt_roundps_epu64 (__mmask8 k, __m256 a, int rounding)
Synopsis
__m512i _mm512_maskz_cvt_roundps_epu64 (__mmask8 k, __m256 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2uqq
CPUID Flags: AVX512DQ
Description
Convert packed single-precision (32-bit) floating-point elements in
a to packed unsigned 64-bit integers, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
l := j*32
IF k[j]
dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vcvtps2pd
__m512d _mm512_cvt_roundps_pd (__m256 a, int sae)
Synopsis
__m512d _mm512_cvt_roundps_pd (__m256 a, int sae)
#include "immintrin.h"
Instruction: vcvtps2pd zmm {k}, ymm {sae}
CPUID Flags: AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.
Pass _MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
FOR j := 0 to 7
i := 64*j
k := 32*j
dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
ENDFOR
dst[MAX:512] := 0
vcvtps2pd
__m512d _mm512_mask_cvt_roundps_pd (__m512d src, __mmask8 k, __m256 a, int sae)
Synopsis
__m512d _mm512_mask_cvt_roundps_pd (__m512d src, __mmask8 k, __m256 a, int sae)
#include "immintrin.h"
Instruction: vcvtps2pd zmm {k}, ymm {sae}
CPUID Flags: AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Pass _MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
FOR j := 0 to 7
i := 64*j
l := 32*j
IF k[j]
dst[i+63:i] := Convert_FP32_To_FP64(a[l+31:l])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vcvtps2pd
__m512d _mm512_maskz_cvt_roundps_pd (__mmask8 k, __m256 a, int sae)
Synopsis
__m512d _mm512_maskz_cvt_roundps_pd (__mmask8 k, __m256 a, int sae)
#include "immintrin.h"
Instruction: vcvtps2pd zmm {k}, ymm {sae}
CPUID Flags: AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Pass _MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
FOR j := 0 to 7
i := 64*j
l := 32*j
IF k[j]
dst[i+63:i] := Convert_FP32_To_FP64(a[l+31:l])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vcvtps2ph
__m128i _mm_mask_cvt_roundps_ph (__m128i src, __mmask8 k, __m128 a, int rounding)
Synopsis
__m128i _mm_mask_cvt_roundps_ph (__m128i src, __mmask8 k, __m128 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2ph
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in
a to packed half-precision (16-bit) floating-point elements, and store the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 3
i := 16*j
l := 32*j
IF k[j]
dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:64] := 0
vcvtps2ph
__m128i _mm_maskz_cvt_roundps_ph (__mmask8 k, __m128 a, int rounding)
Synopsis
__m128i _mm_maskz_cvt_roundps_ph (__mmask8 k, __m128 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2ph
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in
a to packed half-precision (16-bit) floating-point elements, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 3
i := 16*j
l := 32*j
IF k[j]
dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:64] := 0
vcvtps2ph
__m128i _mm256_mask_cvt_roundps_ph (__m128i src, __mmask8 k, __m256 a, int rounding)
Synopsis
__m128i _mm256_mask_cvt_roundps_ph (__m128i src, __mmask8 k, __m256 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2ph
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in
a to packed half-precision (16-bit) floating-point elements, and store the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := 16*j
l := 32*j
IF k[j]
dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:128] := 0
vcvtps2ph
__m128i _mm256_maskz_cvt_roundps_ph (__mmask8 k, __m256 a, int rounding)
Synopsis
__m128i _mm256_maskz_cvt_roundps_ph (__mmask8 k, __m256 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2ph
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in
a to packed half-precision (16-bit) floating-point elements, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := 16*j
l := 32*j
IF k[j]
dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vcvtps2ph
__m256i _mm512_cvt_roundps_ph (__m512 a, int rounding)
Synopsis
__m256i _mm512_cvt_roundps_ph (__m512 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2ph ymm {k}, zmm {sae}, imm
CPUID Flags: AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in
a to packed half-precision (16-bit) floating-point elements, and store the results in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := 16*j
l := 32*j
dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
ENDFOR
dst[MAX:256] := 0
vcvtps2ph
__m256i _mm512_mask_cvt_roundps_ph (__m256i src, __mmask16 k, __m512 a, int rounding)
Synopsis
__m256i _mm512_mask_cvt_roundps_ph (__m256i src, __mmask16 k, __m512 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2ph ymm {k}, zmm {sae}, imm
CPUID Flags: AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in
a to packed half-precision (16-bit) floating-point elements, and store the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := 16*j
l := 32*j
IF k[j]
dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
vcvtps2ph
__m256i _mm512_maskz_cvt_roundps_ph (__mmask16 k, __m512 a, int rounding)
Synopsis
__m256i _mm512_maskz_cvt_roundps_ph (__mmask16 k, __m512 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2ph ymm {k}, zmm {sae}, imm
CPUID Flags: AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in
a to packed half-precision (16-bit) floating-point elements, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := 16*j
l := 32*j
IF k[j]
dst[i+15:i] := Convert_FP32_To_FP16(a[l+31:l])
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vcvtsd2si
int _mm_cvt_roundsd_i32 (__m128d a, int rounding)
Synopsis
int _mm_cvt_roundsd_i32 (__m128d a, int rounding)
#include "immintrin.h"
Instruction: vcvtsd2si r32, xmm {er}
CPUID Flags: AVX512F
Description
Convert the lower double-precision (64-bit) floating-point element in
a to a 32-bit integer, and store the result in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[31:0] := Convert_FP64_To_Int32(a[63:0])
vcvtsd2si
__int64 _mm_cvt_roundsd_i64 (__m128d a, int rounding)
Synopsis
__int64 _mm_cvt_roundsd_i64 (__m128d a, int rounding)
#include "immintrin.h"
Instruction: vcvtsd2si r64, xmm {er}
CPUID Flags: AVX512F
Description
Convert the lower double-precision (64-bit) floating-point element in
a to a 64-bit integer, and store the result in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[63:0] := Convert_FP64_To_Int64(a[63:0])
vcvtsd2si
int _mm_cvt_roundsd_si32 (__m128d a, int rounding)
Synopsis
int _mm_cvt_roundsd_si32 (__m128d a, int rounding)
#include "immintrin.h"
Instruction: vcvtsd2si r32, xmm {er}
CPUID Flags: AVX512F
Description
Convert the lower double-precision (64-bit) floating-point element in
a to a 32-bit integer, and store the result in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[31:0] := Convert_FP64_To_Int32(a[63:0])
vcvtsd2si
__int64 _mm_cvt_roundsd_si64 (__m128d a, int rounding)
Synopsis
__int64 _mm_cvt_roundsd_si64 (__m128d a, int rounding)
#include "immintrin.h"
Instruction: vcvtsd2si r64, xmm {er}
CPUID Flags: AVX512F
Description
Convert the lower double-precision (64-bit) floating-point element in
a to a 64-bit integer, and store the result in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[63:0] := Convert_FP64_To_Int64(a[63:0])
vcvtsd2ss
__m128 _mm_cvt_roundsd_ss (__m128 a, __m128d b, int rounding)
Synopsis
__m128 _mm_cvt_roundsd_ss (__m128 a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vcvtsd2ss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Convert the lower double-precision (64-bit) floating-point element in
b to a single-precision (32-bit) floating-point element, store the result in the lower element of
dst, and copy the upper 3 packed elements from
a to the upper elements of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[31:0] := Convert_FP64_To_FP32(b[63:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vcvtsd2ss
__m128 _mm_mask_cvt_roundsd_ss (__m128 src, __mmask8 k, __m128 a, __m128d b, int rounding)
Synopsis
__m128 _mm_mask_cvt_roundsd_ss (__m128 src, __mmask8 k, __m128 a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vcvtsd2ss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Convert the lower double-precision (64-bit) floating-point element in
b to a single-precision (32-bit) floating-point element, store the result in the lower element of
dst using writemask
k (the element is copied from
src when mask bit 0 is not set), and copy the upper element from
a to the upper element of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[31:0] := Convert_FP64_To_FP32(b[63:0])
ELSE
dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vcvtsd2ss
__m128 _mm_maskz_cvt_roundsd_ss (__mmask8 k, __m128 a, __m128d b, int rounding)
Synopsis
__m128 _mm_maskz_cvt_roundsd_ss (__mmask8 k, __m128 a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vcvtsd2ss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Convert the lower double-precision (64-bit) floating-point element in
b to a single-precision (32-bit) floating-point element, store the result in the lower element of
dst using zeromask
k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from
a to the upper elements of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[31:0] := Convert_FP64_To_FP32(b[63:0])
ELSE
dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vcvtsd2usi
unsigned int _mm_cvt_roundsd_u32 (__m128d a, int rounding)
Synopsis
unsigned int _mm_cvt_roundsd_u32 (__m128d a, int rounding)
#include "immintrin.h"
Instruction: vcvtsd2usi r32, xmm {er}
CPUID Flags: AVX512F
Description
Convert the lower double-precision (64-bit) floating-point element in
a to an unsigned 32-bit integer, and store the result in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[31:0] := Convert_FP64_To_UnsignedInt32(a[63:0])
vcvtsd2usi
unsigned __int64 _mm_cvt_roundsd_u64 (__m128d a, int rounding)
Synopsis
unsigned __int64 _mm_cvt_roundsd_u64 (__m128d a, int rounding)
#include "immintrin.h"
Instruction: vcvtsd2usi r64, xmm {er}
CPUID Flags: AVX512F
Description
Convert the lower double-precision (64-bit) floating-point element in
a to an unsigned 64-bit integer, and store the result in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[63:0] := Convert_FP64_To_UnsignedInt64(a[63:0])
vcvtsi2ss
__m128 _mm_cvt_roundsi32_ss (__m128 a, int b, int rounding)
Synopsis
__m128 _mm_cvt_roundsi32_ss (__m128 a, int b, int rounding)
#include "immintrin.h"
Instruction: vcvtsi2ss xmm, xmm, r32 {er}
CPUID Flags: AVX512F
Description
Convert the 32-bit integer
b to a single-precision (32-bit) floating-point element, store the result in the lower element of
dst, and copy the upper 3 packed elements from
a to the upper elements of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[31:0] := Convert_Int32_To_FP32(b[31:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vcvtsi2sd
__m128d _mm_cvt_roundsi64_sd (__m128d a, __int64 b, int rounding)
Synopsis
__m128d _mm_cvt_roundsi64_sd (__m128d a, __int64 b, int rounding)
#include "immintrin.h"
Instruction: vcvtsi2sd xmm, xmm, r64 {er}
CPUID Flags: AVX512F
Description
Convert the 64-bit integer
b to a double-precision (64-bit) floating-point element, store the result in the lower element of
dst, and copy the upper element from
a to the upper element of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[63:0] := Convert_Int64_To_FP64(b[63:0])
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vcvtsi2ss
__m128 _mm_cvt_roundsi64_ss (__m128 a, __int64 b, int rounding)
Synopsis
__m128 _mm_cvt_roundsi64_ss (__m128 a, __int64 b, int rounding)
#include "immintrin.h"
Instruction: vcvtsi2ss xmm, xmm, r64 {er}
CPUID Flags: AVX512F
Description
Convert the 64-bit integer
b to a single-precision (32-bit) floating-point element, store the result in the lower element of
dst, and copy the upper 3 packed elements from
a to the upper elements of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[31:0] := Convert_Int64_To_FP32(b[63:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vcvtss2si
int _mm_cvt_roundss_i32 (__m128 a, int rounding)
Synopsis
int _mm_cvt_roundss_i32 (__m128 a, int rounding)
#include "immintrin.h"
Instruction: vcvtss2si r32, xmm {er}
CPUID Flags: AVX512F
Description
Convert the lower single-precision (32-bit) floating-point element in
a to a 32-bit integer, and store the result in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[31:0] := Convert_FP32_To_Int32(a[31:0])
vcvtss2si
__int64 _mm_cvt_roundss_i64 (__m128 a, int rounding)
Synopsis
__int64 _mm_cvt_roundss_i64 (__m128 a, int rounding)
#include "immintrin.h"
Instruction: vcvtss2si r64, xmm {er}
CPUID Flags: AVX512F
Description
Convert the lower single-precision (32-bit) floating-point element in
a to a 64-bit integer, and store the result in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[63:0] := Convert_FP32_To_Int64(a[31:0])
vcvtss2sd
__m128d _mm_cvt_roundss_sd (__m128d a, __m128 b, int rounding)
Synopsis
__m128d _mm_cvt_roundss_sd (__m128d a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vcvtss2sd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Convert the lower single-precision (32-bit) floating-point element in
b to a double-precision (64-bit) floating-point element, store the result in the lower element of
dst, and copy the upper element from
a to the upper element of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[63:0] := Convert_FP32_To_FP64(b[31:0])
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vcvtss2sd
__m128d _mm_mask_cvt_roundss_sd (__m128d src, __mmask8 k, __m128d a, __m128 b, int rounding)
Synopsis
__m128d _mm_mask_cvt_roundss_sd (__m128d src, __mmask8 k, __m128d a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vcvtss2sd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Convert the lower single-precision (32-bit) floating-point element in
b to a double-precision (64-bit) floating-point element, store the result in the lower element of
dst using writemask
k (the element is copied from
src when mask bit 0 is not set), and copy the upper element from
a to the upper element of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[63:0] := Convert_FP32_To_FP64(b[31:0])
ELSE
dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vcvtss2sd
__m128d _mm_maskz_cvt_roundss_sd (__mmask8 k, __m128d a, __m128 b, int rounding)
Synopsis
__m128d _mm_maskz_cvt_roundss_sd (__mmask8 k, __m128d a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vcvtss2sd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Convert the lower single-precision (32-bit) floating-point element in
b to a double-precision (64-bit) floating-point element, store the result in the lower element of
dst using zeromask
k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from
a to the upper element of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[63:0] := Convert_FP32_To_FP64(b[31:0])
ELSE
dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vcvtss2si
int _mm_cvt_roundss_si32 (__m128 a, int rounding)
Synopsis
int _mm_cvt_roundss_si32 (__m128 a, int rounding)
#include "immintrin.h"
Instruction: vcvtss2si r32, xmm {er}
CPUID Flags: AVX512F
Description
Convert the lower single-precision (32-bit) floating-point element in
a to a 32-bit integer, and store the result in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[31:0] := Convert_FP32_To_Int32(a[31:0])
vcvtss2si
__int64 _mm_cvt_roundss_si64 (__m128 a, int rounding)
Synopsis
__int64 _mm_cvt_roundss_si64 (__m128 a, int rounding)
#include "immintrin.h"
Instruction: vcvtss2si r64, xmm {er}
CPUID Flags: AVX512F
Description
Convert the lower single-precision (32-bit) floating-point element in
a to a 64-bit integer, and store the result in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[63:0] := Convert_FP32_To_Int64(a[31:0])
vcvtss2usi
unsigned int _mm_cvt_roundss_u32 (__m128 a, int rounding)
Synopsis
unsigned int _mm_cvt_roundss_u32 (__m128 a, int rounding)
#include "immintrin.h"
Instruction: vcvtss2usi r32, xmm {er}
CPUID Flags: AVX512F
Description
Convert the lower single-precision (32-bit) floating-point element in
a to an unsigned 32-bit integer, and store the result in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[31:0] := Convert_FP32_To_UnsignedInt32(a[31:0])
vcvtss2usi
unsigned __int64 _mm_cvt_roundss_u64 (__m128 a, int rounding)
Synopsis
unsigned __int64 _mm_cvt_roundss_u64 (__m128 a, int rounding)
#include "immintrin.h"
Instruction: vcvtss2usi r64, xmm {er}
CPUID Flags: AVX512F
Description
Convert the lower single-precision (32-bit) floating-point element in
a to an unsigned 64-bit integer, and store the result in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[63:0] := Convert_FP32_To_UnsignedInt64(a[31:0])
vcvtusi2ss
__m128 _mm_cvt_roundu32_ss (__m128 a, unsigned int b, int rounding)
Synopsis
__m128 _mm_cvt_roundu32_ss (__m128 a, unsigned int b, int rounding)
#include "immintrin.h"
Instruction: vcvtusi2ss xmm, xmm, r32 {er}
CPUID Flags: AVX512F
Description
Convert the unsigned 32-bit integer
b to a single-precision (32-bit) floating-point element, store the result in the lower element of
dst, and copy the upper 3 packed elements from
a to the upper elements of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[31:0] := Convert_UnsignedInt32_To_FP32(b[31:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vcvtusi2sd
__m128d _mm_cvt_roundu64_sd (__m128d a, unsigned __int64 b, int rounding)
Synopsis
__m128d _mm_cvt_roundu64_sd (__m128d a, unsigned __int64 b, int rounding)
#include "immintrin.h"
Instruction: vcvtusi2sd xmm, xmm, r64 {er}
CPUID Flags: AVX512F
Description
Convert the unsigned 64-bit integer
b to a double-precision (64-bit) floating-point element, store the result in the lower element of
dst, and copy the upper element from
a to the upper element of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[63:0] := Convert_UnsignedInt64_To_FP64(b[63:0])
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vcvtusi2ss
__m128 _mm_cvt_roundu64_ss (__m128 a, unsigned __int64 b, int rounding)
Synopsis
__m128 _mm_cvt_roundu64_ss (__m128 a, unsigned __int64 b, int rounding)
#include "immintrin.h"
Instruction: vcvtusi2ss xmm, xmm, r64 {er}
CPUID Flags: AVX512F
Description
Convert the unsigned 64-bit integer
b to a single-precision (32-bit) floating-point element, store the result in the lower element of
dst, and copy the upper 3 packed elements from
a to the upper elements of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[31:0] := Convert_UnsignedInt64_To_FP32(b[63:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0
cvtsi2ss
__m128 _mm_cvt_si2ss (__m128 a, int b)
Synopsis
__m128 _mm_cvt_si2ss (__m128 a, int b)
#include "xmmintrin.h"
Instruction: cvtsi2ss xmm, r32
CPUID Flags: SSE
Description
Convert the 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
dst[31:0] := Convert_Int32_To_FP32(b[31:0])
dst[127:32] := a[127:32]
Performance
cvtss2si
int _mm_cvt_ss2si (__m128 a)
Synopsis
int _mm_cvt_ss2si (__m128 a)
#include "xmmintrin.h"
Instruction: cvtss2si r32, xmm
CPUID Flags: SSE
Description
Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer, and store the result in dst.
Operation
dst[31:0] := Convert_FP32_To_Int32(a[31:0])
pmovsxwd
__m128i _mm_cvtepi16_epi32 (__m128i a)
Synopsis
__m128i _mm_cvtepi16_epi32 (__m128i a)
#include "smmintrin.h"
Instruction: pmovsxwd xmm, xmm
CPUID Flags: SSE4.1
Description
Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst.
Operation
FOR j := 0 to 3
i := 32*j
k := 16*j
dst[i+31:i] := SignExtend(a[k+15:k])
ENDFOR
Performance
vpmovsxwd
__m128i _mm_mask_cvtepi16_epi32 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_cvtepi16_epi32 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxwd
CPUID Flags: AVX512VL + AVX512F
Description
Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
l := j*16
IF k[j]
dst[i+31:i] := SignExtend(a[l+15:l])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vpmovsxwd
__m128i _mm_maskz_cvtepi16_epi32 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_cvtepi16_epi32 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxwd
CPUID Flags: AVX512VL + AVX512F
Description
Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 32*j
l := 16*j
IF k[j]
dst[i+31:i] := SignExtend(a[l+15:l])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpmovsxwd
__m256i _mm256_cvtepi16_epi32 (__m128i a)
Synopsis
__m256i _mm256_cvtepi16_epi32 (__m128i a)
#include "immintrin.h"
Instruction: vpmovsxwd ymm, xmm
CPUID Flags: AVX2
Description
Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst.
Operation
FOR j := 0 to 7
i := 32*j
k := 16*j
dst[i+31:i] := SignExtend(a[k+15:k])
ENDFOR
dst[MAX:256] := 0
Performance
vpmovsxwd
__m256i _mm256_mask_cvtepi16_epi32 (__m256i src, __mmask8 k, __m128i a)
Synopsis
__m256i _mm256_mask_cvtepi16_epi32 (__m256i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxwd
CPUID Flags: AVX512VL + AVX512F
Description
Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
l := j*16
IF k[j]
dst[i+31:i] := SignExtend(a[l+15:l])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vpmovsxwd
__m256i _mm256_maskz_cvtepi16_epi32 (__mmask8 k, __m128i a)
Synopsis
__m256i _mm256_maskz_cvtepi16_epi32 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxwd
CPUID Flags: AVX512VL + AVX512F
Description
Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 32*j
l := 16*j
IF k[j]
dst[i+31:i] := SignExtend(a[l+15:l])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpmovsxwd
__m512i _mm512_cvtepi16_epi32 (__m256i a)
Synopsis
__m512i _mm512_cvtepi16_epi32 (__m256i a)
#include "immintrin.h"
Instruction: vpmovsxwd zmm {k}, ymm
CPUID Flags: AVX512F
Description
Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst.
Operation
FOR j := 0 to 15
i := 32*j
k := 16*j
dst[i+31:i] := SignExtend(a[k+15:k])
ENDFOR
dst[MAX:512] := 0
vpmovsxwd
__m512i _mm512_mask_cvtepi16_epi32 (__m512i src, __mmask16 k, __m256i a)
Synopsis
__m512i _mm512_mask_cvtepi16_epi32 (__m512i src, __mmask16 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovsxwd zmm {k}, ymm
CPUID Flags: AVX512F
Description
Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
l := j*16
IF k[j]
dst[i+31:i] := SignExtend(a[l+15:l])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpmovsxwd
__m512i _mm512_maskz_cvtepi16_epi32 (__mmask16 k, __m256i a)
Synopsis
__m512i _mm512_maskz_cvtepi16_epi32 (__mmask16 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovsxwd zmm {k}, ymm
CPUID Flags: AVX512F
Description
Sign extend packed 16-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := 32*j
l := 16*j
IF k[j]
dst[i+31:i] := SignExtend(a[l+15:l])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
pmovsxwq
__m128i _mm_cvtepi16_epi64 (__m128i a)
Synopsis
__m128i _mm_cvtepi16_epi64 (__m128i a)
#include "smmintrin.h"
Instruction: pmovsxwq xmm, xmm
CPUID Flags: SSE4.1
Description
Sign extend packed 16-bit integers in a to packed 64-bit integers, and store the results in dst.
Operation
FOR j := 0 to 1
i := 64*j
k := 16*j
dst[i+63:i] := SignExtend(a[k+15:k])
ENDFOR
Performance
vpmovsxwq
__m128i _mm_mask_cvtepi16_epi64 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_cvtepi16_epi64 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxwq
CPUID Flags: AVX512VL + AVX512F
Description
Sign extend packed 16-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := 64*j
l := 16*j
IF k[j]
dst[i+63:i] := SignExtend(a[l+15:l])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpmovsxwq
__m128i _mm_maskz_cvtepi16_epi64 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_cvtepi16_epi64 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxwq
CPUID Flags: AVX512VL + AVX512F
Description
Sign extend packed 16-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := 64*j
l := 16*j
IF k[j]
dst[i+63:i] := SignExtend(a[l+15:l])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpmovsxwq
__m256i _mm256_cvtepi16_epi64 (__m128i a)
Synopsis
__m256i _mm256_cvtepi16_epi64 (__m128i a)
#include "immintrin.h"
Instruction: vpmovsxwq ymm, xmm
CPUID Flags: AVX2
Description
Sign extend packed 16-bit integers in a to packed 64-bit integers, and store the results in dst.
Operation
FOR j := 0 to 3
i := 64*j
k := 16*j
dst[i+63:i] := SignExtend(a[k+15:k])
ENDFOR
dst[MAX:256] := 0
Performance
vpmovsxwq
__m256i _mm256_mask_cvtepi16_epi64 (__m256i src, __mmask8 k, __m128i a)
Synopsis
__m256i _mm256_mask_cvtepi16_epi64 (__m256i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxwq
CPUID Flags: AVX512VL + AVX512F
Description
Sign extend packed 16-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 64*j
l := 16*j
IF k[j]
dst[i+63:i] := SignExtend(a[l+15:l])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpmovsxwq
__m256i _mm256_maskz_cvtepi16_epi64 (__mmask8 k, __m128i a)
Synopsis
__m256i _mm256_maskz_cvtepi16_epi64 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxwq
CPUID Flags: AVX512VL + AVX512F
Description
Sign extend packed 16-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 64*j
l := 16*j
IF k[j]
dst[i+63:i] := SignExtend(a[l+15:l])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpmovsxwq
__m512i _mm512_cvtepi16_epi64 (__m128i a)
Synopsis
__m512i _mm512_cvtepi16_epi64 (__m128i a)
#include "immintrin.h"
Instruction: vpmovsxwq zmm {k}, xmm
CPUID Flags: AVX512F
Description
Sign extend packed 16-bit integers in a to packed 64-bit integers, and store the results in dst.
Operation
FOR j := 0 to 7
i := 64*j
k := 16*j
dst[i+63:i] := SignExtend(a[k+15:k])
ENDFOR
dst[MAX:512] := 0
vpmovsxwq
__m512i _mm512_mask_cvtepi16_epi64 (__m512i src, __mmask8 k, __m128i a)
Synopsis
__m512i _mm512_mask_cvtepi16_epi64 (__m512i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxwq zmm {k}, xmm
CPUID Flags: AVX512F
Description
Sign extend packed 16-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 64*j
l := 16*j
IF k[j]
dst[i+63:i] := SignExtend(a[l+15:l])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpmovsxwq
__m512i _mm512_maskz_cvtepi16_epi64 (__mmask8 k, __m128i a)
Synopsis
__m512i _mm512_maskz_cvtepi16_epi64 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxwq zmm {k}, xmm
CPUID Flags: AVX512F
Description
Sign extend packed 16-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 64*j
l := 16*j
IF k[j]
dst[i+63:i] := SignExtend(a[l+15:l])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpmovwb
__m128i _mm_cvtepi16_epi8 (__m128i a)
Synopsis
__m128i _mm_cvtepi16_epi8 (__m128i a)
#include "immintrin.h"
Instruction: vpmovwb
CPUID Flags: AVX512VL + AVX512BW
Description
Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst.
Operation
FOR j := 0 to 7
i := 16*j
l := 8*j
dst[l+7:l] := Truncate_Int16_To_Int8(a[i+15:i])
ENDFOR
dst[MAX:64] := 0
vpmovwb
__m128i _mm_mask_cvtepi16_epi8 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_cvtepi16_epi8 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovwb
CPUID Flags: AVX512VL + AVX512BW
Description
Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 16*j
l := 8*j
IF k[j]
dst[l+7:l] := Truncate_Int16_To_Int8(a[i+15:i])
ELSE
dst[l+7:l] := src[l+7:l]
FI
ENDFOR
dst[MAX:64] := 0
vpmovwb
__m128i _mm_maskz_cvtepi16_epi8 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_cvtepi16_epi8 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovwb
CPUID Flags: AVX512VL + AVX512BW
Description
Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 16*j
l := 8*j
IF k[j]
dst[l+7:l] := Truncate_Int16_To_Int8(a[i+15:i])
ELSE
dst[l+7:l] := 0
FI
ENDFOR
dst[MAX:64] := 0
vpmovwb
__m128i _mm256_cvtepi16_epi8 (__m256i a)
Synopsis
__m128i _mm256_cvtepi16_epi8 (__m256i a)
#include "immintrin.h"
Instruction: vpmovwb
CPUID Flags: AVX512VL + AVX512BW
Description
Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst.
Operation
FOR j := 0 to 15
i := 16*j
l := 8*j
dst[l+7:l] := Truncate_Int16_To_Int8(a[i+15:i])
ENDFOR
dst[MAX:128] := 0
vpmovwb
__m128i _mm256_mask_cvtepi16_epi8 (__m128i src, __mmask16 k, __m256i a)
Synopsis
__m128i _mm256_mask_cvtepi16_epi8 (__m128i src, __mmask16 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovwb
CPUID Flags: AVX512VL + AVX512BW
Description
Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := 16*j
l := 8*j
IF k[j]
dst[l+7:l] := Truncate_Int16_To_Int8(a[i+15:i])
ELSE
dst[l+7:l] := src[l+7:l]
FI
ENDFOR
dst[MAX:128] := 0
vpmovwb
__m128i _mm256_maskz_cvtepi16_epi8 (__mmask16 k, __m256i a)
Synopsis
__m128i _mm256_maskz_cvtepi16_epi8 (__mmask16 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovwb
CPUID Flags: AVX512VL + AVX512BW
Description
Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := 16*j
l := 8*j
IF k[j]
dst[l+7:l] := Truncate_Int16_To_Int8(a[i+15:i])
ELSE
dst[l+7:l] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpmovwb
__m256i _mm512_cvtepi16_epi8 (__m512i a)
Synopsis
__m256i _mm512_cvtepi16_epi8 (__m512i a)
#include "immintrin.h"
Instruction: vpmovwb
CPUID Flags: AVX512BW
Description
Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst.
Operation
FOR j := 0 to 31
i := 16*j
l := 8*j
dst[l+7:l] := Truncate_Int16_To_Int8(a[i+15:i])
ENDFOR
dst[MAX:256] := 0
vpmovwb
__m256i _mm512_mask_cvtepi16_epi8 (__m256i src, __mmask32 k, __m512i a)
Synopsis
__m256i _mm512_mask_cvtepi16_epi8 (__m256i src, __mmask32 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovwb
CPUID Flags: AVX512BW
Description
Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := 16*j
l := 8*j
IF k[j]
dst[l+7:l] := Truncate_Int16_To_Int8(a[i+15:i])
ELSE
dst[l+7:l] := src[l+7:l]
FI
ENDFOR
dst[MAX:256] := 0
vpmovwb
__m256i _mm512_maskz_cvtepi16_epi8 (__mmask32 k, __m512i a)
Synopsis
__m256i _mm512_maskz_cvtepi16_epi8 (__mmask32 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovwb
CPUID Flags: AVX512BW
Description
Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := 16*j
l := 8*j
IF k[j]
dst[l+7:l] := Truncate_Int16_To_Int8(a[i+15:i])
ELSE
dst[l+7:l] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpmovwb
void _mm_mask_cvtepi16_storeu_epi8 (void* base_addr, __mmask8 k, __m128i a)
Synopsis
void _mm_mask_cvtepi16_storeu_epi8 (void* base_addr, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovwb
CPUID Flags: AVX512VL + AVX512BW
Description
Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 7
i := 16*j
l := 8*j
IF k[j]
MEM[base_addr+l+7:base_addr+l] := Truncate_Int16_To_Int8(a[i+15:i])
FI
ENDFOR
dst[MAX:64] := 0
vpmovwb
void _mm256_mask_cvtepi16_storeu_epi8 (void* base_addr, __mmask16 k, __m256i a)
Synopsis
void _mm256_mask_cvtepi16_storeu_epi8 (void* base_addr, __mmask16 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovwb
CPUID Flags: AVX512VL + AVX512BW
Description
Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 15
i := 16*j
l := 8*j
IF k[j]
MEM[base_addr+l+7:base_addr+l] := Truncate_Int16_To_Int8(a[i+15:i])
FI
ENDFOR
dst[MAX:128] := 0
vpmovwb
void _mm512_mask_cvtepi16_storeu_epi8 (void* base_addr, __mmask32 k, __m512i a)
Synopsis
void _mm512_mask_cvtepi16_storeu_epi8 (void* base_addr, __mmask32 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovwb
CPUID Flags: AVX512BW
Description
Convert packed 16-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 31
i := 16*j
l := 8*j
IF k[j]
MEM[base_addr+l+7:base_addr+l] := Truncate_Int16_To_Int8(a[i+15:i])
FI
ENDFOR
dst[MAX:256] := 0
vpmovdw
__m128i _mm_cvtepi32_epi16 (__m128i a)
Synopsis
__m128i _mm_cvtepi32_epi16 (__m128i a)
#include "immintrin.h"
Instruction: vpmovdw
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst.
Operation
FOR j := 0 to 3
i := 32*j
k := 16*j
dst[k+15:k] := Truncate_Int32_To_Int16(a[i+31:i])
ENDFOR
dst[MAX:64] := 0
vpmovdw
__m128i _mm_mask_cvtepi32_epi16 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_cvtepi32_epi16 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovdw
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 32*j
l := 16*j
IF k[j]
dst[l+15:l] := Truncate_Int32_To_Int16(a[i+31:i])
ELSE
dst[l+15:l] := src[l+15:l]
FI
ENDFOR
dst[MAX:64] := 0
vpmovdw
__m128i _mm_maskz_cvtepi32_epi16 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_cvtepi32_epi16 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovdw
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 32*j
l := 16*j
IF k[j]
dst[l+15:l] := Truncate_Int32_To_Int16(a[i+31:i])
ELSE
dst[l+15:l] := 0
FI
ENDFOR
dst[MAX:64] := 0
vpmovdw
__m128i _mm256_cvtepi32_epi16 (__m256i a)
Synopsis
__m128i _mm256_cvtepi32_epi16 (__m256i a)
#include "immintrin.h"
Instruction: vpmovdw
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst.
Operation
FOR j := 0 to 7
i := 32*j
k := 16*j
dst[k+15:k] := Truncate_Int32_To_Int16(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
vpmovdw
__m128i _mm256_mask_cvtepi32_epi16 (__m128i src, __mmask8 k, __m256i a)
Synopsis
__m128i _mm256_mask_cvtepi32_epi16 (__m128i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovdw
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 32*j
l := 16*j
IF k[j]
dst[l+15:l] := Truncate_Int32_To_Int16(a[i+31:i])
ELSE
dst[l+15:l] := src[l+15:l]
FI
ENDFOR
dst[MAX:128] := 0
vpmovdw
__m128i _mm256_maskz_cvtepi32_epi16 (__mmask8 k, __m256i a)
Synopsis
__m128i _mm256_maskz_cvtepi32_epi16 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovdw
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 32*j
l := 16*j
IF k[j]
dst[l+15:l] := Truncate_Int32_To_Int16(a[i+31:i])
ELSE
dst[l+15:l] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpmovdw
__m256i _mm512_cvtepi32_epi16 (__m512i a)
Synopsis
__m256i _mm512_cvtepi32_epi16 (__m512i a)
#include "immintrin.h"
Instruction: vpmovdw ymm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst.
Operation
FOR j := 0 to 15
i := 32*j
k := 16*j
dst[k+15:k] := Truncate_Int32_To_Int16(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
vpmovdw
__m256i _mm512_mask_cvtepi32_epi16 (__m256i src, __mmask16 k, __m512i a)
Synopsis
__m256i _mm512_mask_cvtepi32_epi16 (__m256i src, __mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovdw ymm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := 32*j
l := 16*j
IF k[j]
dst[l+15:l] := Truncate_Int32_To_Int16(a[i+31:i])
ELSE
dst[l+15:l] := src[l+15:l]
FI
ENDFOR
dst[MAX:256] := 0
vpmovdw
__m256i _mm512_maskz_cvtepi32_epi16 (__mmask16 k, __m512i a)
Synopsis
__m256i _mm512_maskz_cvtepi32_epi16 (__mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovdw ymm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := 32*j
l := 16*j
IF k[j]
dst[l+15:l] := Truncate_Int32_To_Int16(a[i+31:i])
ELSE
dst[l+15:l] := 0
FI
ENDFOR
dst[MAX:256] := 0
pmovsxdq
__m128i _mm_cvtepi32_epi64 (__m128i a)
Synopsis
__m128i _mm_cvtepi32_epi64 (__m128i a)
#include "smmintrin.h"
Instruction: pmovsxdq xmm, xmm
CPUID Flags: SSE4.1
Description
Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst.
Operation
FOR j := 0 to 1
i := 64*j
k := 32*j
dst[i+63:i] := SignExtend(a[k+31:k])
ENDFOR
Performance
vpmovsxdq
__m128i _mm_mask_cvtepi32_epi64 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_cvtepi32_epi64 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxdq
CPUID Flags: AVX512VL + AVX512F
Description
Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := 64*j
l := 32*j
IF k[j]
dst[i+63:i] := SignExtend(a[l+31:l])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpmovsxdq
__m128i _mm_maskz_cvtepi32_epi64 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_cvtepi32_epi64 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxdq
CPUID Flags: AVX512VL + AVX512F
Description
Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := 64*j
l := 32*j
IF k[j]
dst[i+63:i] := SignExtend(a[l+31:l])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpmovsxdq
__m256i _mm256_cvtepi32_epi64 (__m128i a)
Synopsis
__m256i _mm256_cvtepi32_epi64 (__m128i a)
#include "immintrin.h"
Instruction: vpmovsxdq ymm, xmm
CPUID Flags: AVX2
Description
Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst.
Operation
FOR j := 0 to 3
i := 64*j
k := 32*j
dst[i+63:i] := SignExtend(a[k+31:k])
ENDFOR
dst[MAX:256] := 0
Performance
vpmovsxdq
__m256i _mm256_mask_cvtepi32_epi64 (__m256i src, __mmask8 k, __m128i a)
Synopsis
__m256i _mm256_mask_cvtepi32_epi64 (__m256i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxdq
CPUID Flags: AVX512VL + AVX512F
Description
Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 64*j
l := 32*j
IF k[j]
dst[i+63:i] := SignExtend(a[l+31:l])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpmovsxdq
__m256i _mm256_maskz_cvtepi32_epi64 (__mmask8 k, __m128i a)
Synopsis
__m256i _mm256_maskz_cvtepi32_epi64 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxdq
CPUID Flags: AVX512VL + AVX512F
Description
Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 64*j
l := 32*j
IF k[j]
dst[i+63:i] := SignExtend(a[l+31:l])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpmovsxdq
__m512i _mm512_cvtepi32_epi64 (__m256i a)
Synopsis
__m512i _mm512_cvtepi32_epi64 (__m256i a)
#include "immintrin.h"
Instruction: vpmovsxdq zmm {k}, ymm
CPUID Flags: AVX512F
Description
Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst.
Operation
FOR j := 0 to 7
i := 64*j
k := 32*j
dst[i+63:i] := SignExtend(a[k+31:k])
ENDFOR
dst[MAX:512] := 0
vpmovsxdq
__m512i _mm512_mask_cvtepi32_epi64 (__m512i src, __mmask8 k, __m256i a)
Synopsis
__m512i _mm512_mask_cvtepi32_epi64 (__m512i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovsxdq zmm {k}, ymm
CPUID Flags: AVX512F
Description
Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 64*j
l := 32*j
IF k[j]
dst[i+63:i] := SignExtend(a[l+31:l])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpmovsxdq
__m512i _mm512_maskz_cvtepi32_epi64 (__mmask8 k, __m256i a)
Synopsis
__m512i _mm512_maskz_cvtepi32_epi64 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovsxdq zmm {k}, ymm
CPUID Flags: AVX512F
Description
Sign extend packed 32-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 64*j
l := 32*j
IF k[j]
dst[i+63:i] := SignExtend(a[l+31:l])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpmovdb
__m128i _mm_cvtepi32_epi8 (__m128i a)
Synopsis
__m128i _mm_cvtepi32_epi8 (__m128i a)
#include "immintrin.h"
Instruction: vpmovdb
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst.
Operation
FOR j := 0 to 3
i := 32*j
k := 8*j
dst[k+7:k] := Truncate_Int32_To_Int8(a[i+31:i])
ENDFOR
dst[MAX:32] := 0
vpmovdb
__m128i _mm_mask_cvtepi32_epi8 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_cvtepi32_epi8 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovdb
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 32*j
l := 8*j
IF k[j]
dst[l+7:l] := Truncate_Int32_To_Int8(a[i+31:i])
ELSE
dst[l+7:l] := src[l+7:l]
FI
ENDFOR
dst[MAX:32] := 0
vpmovdb
__m128i _mm_maskz_cvtepi32_epi8 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_cvtepi32_epi8 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovdb
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 32*j
l := 8*j
IF k[j]
dst[l+7:l] := Truncate_Int32_To_Int8(a[i+31:i])
ELSE
dst[l+7:l] := 0
FI
ENDFOR
dst[MAX:32] := 0
vpmovdb
__m128i _mm256_cvtepi32_epi8 (__m256i a)
Synopsis
__m128i _mm256_cvtepi32_epi8 (__m256i a)
#include "immintrin.h"
Instruction: vpmovdb
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst.
Operation
FOR j := 0 to 7
i := 32*j
k := 8*j
dst[k+7:k] := Truncate_Int32_To_Int8(a[i+31:i])
ENDFOR
dst[MAX:64] := 0
vpmovdb
__m128i _mm256_mask_cvtepi32_epi8 (__m128i src, __mmask8 k, __m256i a)
Synopsis
__m128i _mm256_mask_cvtepi32_epi8 (__m128i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovdb
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 32*j
l := 8*j
IF k[j]
dst[l+7:l] := Truncate_Int32_To_Int8(a[i+31:i])
ELSE
dst[l+7:l] := src[l+7:l]
FI
ENDFOR
dst[MAX:64] := 0
vpmovdb
__m128i _mm256_maskz_cvtepi32_epi8 (__mmask8 k, __m256i a)
Synopsis
__m128i _mm256_maskz_cvtepi32_epi8 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovdb
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 32*j
l := 8*j
IF k[j]
dst[l+7:l] := Truncate_Int32_To_Int8(a[i+31:i])
ELSE
dst[l+7:l] := 0
FI
ENDFOR
dst[MAX:64] := 0
vpmovdb
__m128i _mm512_cvtepi32_epi8 (__m512i a)
Synopsis
__m128i _mm512_cvtepi32_epi8 (__m512i a)
#include "immintrin.h"
Instruction: vpmovdb xmm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst.
Operation
FOR j := 0 to 15
i := 32*j
k := 8*j
dst[k+7:k] := Truncate_Int32_To_Int8(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
vpmovdb
__m128i _mm512_mask_cvtepi32_epi8 (__m128i src, __mmask16 k, __m512i a)
Synopsis
__m128i _mm512_mask_cvtepi32_epi8 (__m128i src, __mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovdb xmm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := 32*j
l := 8*j
IF k[j]
dst[l+7:l] := Truncate_Int32_To_Int8(a[i+31:i])
ELSE
dst[l+7:l] := src[l+7:l]
FI
ENDFOR
dst[MAX:128] := 0
vpmovdb
__m128i _mm512_maskz_cvtepi32_epi8 (__mmask16 k, __m512i a)
Synopsis
__m128i _mm512_maskz_cvtepi32_epi8 (__mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovdb xmm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := 32*j
l := 8*j
IF k[j]
dst[l+7:l] := Truncate_Int32_To_Int8(a[i+31:i])
ELSE
dst[l+7:l] := 0
FI
ENDFOR
dst[MAX:128] := 0
cvtdq2pd
__m128d _mm_cvtepi32_pd (__m128i a)
Synopsis
__m128d _mm_cvtepi32_pd (__m128i a)
#include "emmintrin.h"
Instruction: cvtdq2pd xmm, xmm
CPUID Flags: SSE2
Description
Convert packed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*32
m := j*64
dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
ENDFOR
Performance
vcvtdq2pd
__m128d _mm_mask_cvtepi32_pd (__m128d src, __mmask8 k, __m128i a)
Synopsis
__m128d _mm_mask_cvtepi32_pd (__m128d src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vcvtdq2pd
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*32
m := j*64
IF k[j]
dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
ELSE
dst[m+63:m] := src[m+63:m]
FI
ENDFOR
dst[MAX:128] := 0
vcvtdq2pd
__m128d _mm_maskz_cvtepi32_pd (__mmask8 k, __m128i a)
Synopsis
__m128d _mm_maskz_cvtepi32_pd (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vcvtdq2pd
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*32
m := j*64
IF k[j]
dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
ELSE
dst[m+63:m] := 0
FI
ENDFOR
dst[MAX:128] := 0
vcvtdq2pd
__m256d _mm256_cvtepi32_pd (__m128i a)
Synopsis
__m256d _mm256_cvtepi32_pd (__m128i a)
#include "immintrin.h"
Instruction: vcvtdq2pd ymm, xmm
CPUID Flags: AVX
Description
Convert packed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
m := j*64
dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
Performance
vcvtdq2pd
__m256d _mm256_mask_cvtepi32_pd (__m256d src, __mmask8 k, __m128i a)
Synopsis
__m256d _mm256_mask_cvtepi32_pd (__m256d src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vcvtdq2pd
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
m := j*64
IF k[j]
dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
ELSE
dst[m+63:m] := src[m+63:m]
FI
ENDFOR
dst[MAX:256] := 0
vcvtdq2pd
__m256d _mm256_maskz_cvtepi32_pd (__mmask8 k, __m128i a)
Synopsis
__m256d _mm256_maskz_cvtepi32_pd (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vcvtdq2pd
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
m := j*64
IF k[j]
dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
ELSE
dst[m+63:m] := 0
FI
ENDFOR
dst[MAX:256] := 0
vcvtdq2pd
__m512d _mm512_cvtepi32_pd (__m256i a)
Synopsis
__m512d _mm512_cvtepi32_pd (__m256i a)
#include "immintrin.h"
Instruction: vcvtdq2pd zmm {k}, ymm
CPUID Flags: AVX512F
Description
Convert packed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
m := j*64
dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
vcvtdq2pd
__m512d _mm512_mask_cvtepi32_pd (__m512d src, __mmask8 k, __m256i a)
Synopsis
__m512d _mm512_mask_cvtepi32_pd (__m512d src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vcvtdq2pd zmm {k}, ymm
CPUID Flags: AVX512F
Description
Convert packed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
m := j*64
IF k[j]
dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
ELSE
dst[m+63:m] := src[m+63:m]
FI
ENDFOR
dst[MAX:512] := 0
vcvtdq2pd
__m512d _mm512_maskz_cvtepi32_pd (__mmask8 k, __m256i a)
Synopsis
__m512d _mm512_maskz_cvtepi32_pd (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vcvtdq2pd zmm {k}, ymm
CPUID Flags: AVX512F
Description
Convert packed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
m := j*64
IF k[j]
dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
ELSE
dst[m+63:m] := 0
FI
ENDFOR
dst[MAX:512] := 0
cvtdq2ps
__m128 _mm_cvtepi32_ps (__m128i a)
Synopsis
__m128 _mm_cvtepi32_ps (__m128i a)
#include "emmintrin.h"
Instruction: cvtdq2ps xmm, xmm
CPUID Flags: SSE2
Description
Convert packed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.
Operation
FOR j := 0 to 3
i := 32*j
dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
ENDFOR
Performance
vcvtdq2ps
__m128 _mm_mask_cvtepi32_ps (__m128 src, __mmask8 k, __m128i a)
Synopsis
__m128 _mm_mask_cvtepi32_ps (__m128 src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vcvtdq2ps
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vcvtdq2ps
__m128 _mm_maskz_cvtepi32_ps (__mmask8 k, __m128i a)
Synopsis
__m128 _mm_maskz_cvtepi32_ps (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vcvtdq2ps
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 32*j
IF k[j]
dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vcvtdq2ps
__m256 _mm256_cvtepi32_ps (__m256i a)
Synopsis
__m256 _mm256_cvtepi32_ps (__m256i a)
#include "immintrin.h"
Instruction: vcvtdq2ps ymm, ymm
CPUID Flags: AVX
Description
Convert packed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.
Operation
FOR j := 0 to 7
i := 32*j
dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
Performance
vcvtdq2ps
__m256 _mm256_mask_cvtepi32_ps (__m256 src, __mmask8 k, __m256i a)
Synopsis
__m256 _mm256_mask_cvtepi32_ps (__m256 src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vcvtdq2ps
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vcvtdq2ps
__m256 _mm256_maskz_cvtepi32_ps (__mmask8 k, __m256i a)
Synopsis
__m256 _mm256_maskz_cvtepi32_ps (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vcvtdq2ps
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 32*j
IF k[j]
dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vcvtdq2ps
__m512 _mm512_cvtepi32_ps (__m512i a)
Synopsis
__m512 _mm512_cvtepi32_ps (__m512i a)
#include "immintrin.h"
Instruction: vcvtdq2ps zmm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.
Operation
FOR j := 0 to 15
i := 32*j
dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
vcvtdq2ps
__m512 _mm512_mask_cvtepi32_ps (__m512 src, __mmask16 k, __m512i a)
Synopsis
__m512 _mm512_mask_cvtepi32_ps (__m512 src, __mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vcvtdq2ps zmm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vcvtdq2ps
__m512 _mm512_maskz_cvtepi32_ps (__mmask16 k, __m512i a)
Synopsis
__m512 _mm512_maskz_cvtepi32_ps (__mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vcvtdq2ps zmm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := 32*j
IF k[j]
dst[i+31:i] := Convert_Int32_To_FP32(a[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpmovdw
void _mm_mask_cvtepi32_storeu_epi16 (void* base_addr, __mmask8 k, __m128i a)
Synopsis
void _mm_mask_cvtepi32_storeu_epi16 (void* base_addr, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovdw
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 3
i := 32*j
l := 16*j
IF k[j]
MEM[base_addr+l+15:base_addr+l] := Truncate_Int32_To_Int16(a[i+31:i])
FI
ENDFOR
dst[MAX:64] := 0
vpmovdw
void _mm256_mask_cvtepi32_storeu_epi16 (void* base_addr, __mmask8 k, __m256i a)
Synopsis
void _mm256_mask_cvtepi32_storeu_epi16 (void* base_addr, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovdw
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 7
i := 32*j
l := 16*j
IF k[j]
MEM[base_addr+l+15:base_addr+l] := Truncate_Int32_To_Int16(a[i+31:i])
FI
ENDFOR
dst[MAX:128] := 0
vpmovdw
void _mm512_mask_cvtepi32_storeu_epi16 (void* base_addr, __mmask16 k, __m512i a)
Synopsis
void _mm512_mask_cvtepi32_storeu_epi16 (void* base_addr, __mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovdw m256 {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 15
i := 32*j
l := 16*j
IF k[j]
MEM[base_addr+l+15:base_addr+l] := Truncate_Int32_To_Int16(a[i+31:i])
FI
ENDFOR
vpmovdb
void _mm_mask_cvtepi32_storeu_epi8 (void* base_addr, __mmask8 k, __m128i a)
Synopsis
void _mm_mask_cvtepi32_storeu_epi8 (void* base_addr, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovdb
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 3
i := 32*j
l := 8*j
IF k[j]
MEM[base_addr+l+7:base_addr+l] := Truncate_Int32_To_Int8(a[i+31:i])
FI
ENDFOR
dst[MAX:32] := 0
vpmovdb
void _mm256_mask_cvtepi32_storeu_epi8 (void* base_addr, __mmask8 k, __m256i a)
Synopsis
void _mm256_mask_cvtepi32_storeu_epi8 (void* base_addr, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovdb
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 7
i := 32*j
l := 8*j
IF k[j]
MEM[base_addr+l+7:base_addr+l] := Truncate_Int32_To_Int8(a[i+31:i])
FI
ENDFOR
dst[MAX:64] := 0
vpmovdb
void _mm512_mask_cvtepi32_storeu_epi8 (void* base_addr, __mmask16 k, __m512i a)
Synopsis
void _mm512_mask_cvtepi32_storeu_epi8 (void* base_addr, __mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovdb m128 {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 15
i := 32*j
l := 8*j
IF k[j]
MEM[base_addr+l+7:base_addr+l] := Truncate_Int32_To_Int8(a[i+31:i])
FI
ENDFOR
vcvtdq2pd
__m512d _mm512_cvtepi32lo_pd (__m512i v2)
Synopsis
__m512d _mm512_cvtepi32lo_pd (__m512i v2)
#include "immintrin.h"
Instruction: vcvtdq2pd zmm {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Performs element-by-element conversion of the lower half of packed 32-bit integer elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst.
Operation
FOR j := 0 to 7
i := j*32
l := j*64
dst[l+63:l] := Int32ToFloat64(v2[i+31:i])
ENDFOR
dst[MAX:512] := 0
vcvtdq2pd
__m512d _mm512_mask_cvtepi32lo_pd (__m512d src, __mmask8 k, __m512i v2)
Synopsis
__m512d _mm512_mask_cvtepi32lo_pd (__m512d src, __mmask8 k, __m512i v2)
#include "immintrin.h"
Instruction: vcvtdq2pd zmm {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Performs element-by-element conversion of the lower half of packed 32-bit integer elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
n := j*64
IF k[j]
dst[n+63:n] := Int32ToFloat64(v2[i+31:i])
ELSE
dst[n+63:n] := src[n+63:n]
FI
ENDFOR
dst[MAX:512] := 0
vpmovqw
__m128i _mm_cvtepi64_epi16 (__m128i a)
Synopsis
__m128i _mm_cvtepi64_epi16 (__m128i a)
#include "immintrin.h"
Instruction: vpmovqw
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst.
Operation
FOR j := 0 to 1
i := 64*j
k := 16*j
dst[k+15:k] := Truncate_Int64_To_Int16(a[i+63:i])
ENDFOR
dst[MAX:32] := 0
vpmovqw
__m128i _mm_mask_cvtepi64_epi16 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_cvtepi64_epi16 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovqw
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := 64*j
l := 16*j
IF k[j]
dst[l+15:l] := Truncate_Int64_To_Int16(a[i+63:i])
ELSE
dst[l+15:l] := src[l+15:l]
FI
ENDFOR
dst[MAX:32] := 0
vpmovqw
__m128i _mm_maskz_cvtepi64_epi16 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_cvtepi64_epi16 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovqw
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := 64*j
l := 16*j
IF k[j]
dst[l+15:l] := Truncate_Int64_To_Int16(a[i+63:i])
ELSE
dst[l+15:l] := 0
FI
ENDFOR
dst[MAX:32] := 0
vpmovqw
__m128i _mm256_cvtepi64_epi16 (__m256i a)
Synopsis
__m128i _mm256_cvtepi64_epi16 (__m256i a)
#include "immintrin.h"
Instruction: vpmovqw
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst.
Operation
FOR j := 0 to 3
i := 64*j
k := 16*j
dst[k+15:k] := Truncate_Int64_To_Int16(a[i+63:i])
ENDFOR
dst[MAX:64] := 0
vpmovqw
__m128i _mm256_mask_cvtepi64_epi16 (__m128i src, __mmask8 k, __m256i a)
Synopsis
__m128i _mm256_mask_cvtepi64_epi16 (__m128i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovqw
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 64*j
l := 16*j
IF k[j]
dst[l+15:l] := Truncate_Int64_To_Int16(a[i+63:i])
ELSE
dst[l+15:l] := src[l+15:l]
FI
ENDFOR
dst[MAX:64] := 0
vpmovqw
__m128i _mm256_maskz_cvtepi64_epi16 (__mmask8 k, __m256i a)
Synopsis
__m128i _mm256_maskz_cvtepi64_epi16 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovqw
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 64*j
l := 16*j
IF k[j]
dst[l+15:l] := Truncate_Int64_To_Int16(a[i+63:i])
ELSE
dst[l+15:l] := 0
FI
ENDFOR
dst[MAX:64] := 0
vpmovqw
__m128i _mm512_cvtepi64_epi16 (__m512i a)
Synopsis
__m128i _mm512_cvtepi64_epi16 (__m512i a)
#include "immintrin.h"
Instruction: vpmovqw xmm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst.
Operation
FOR j := 0 to 7
i := 64*j
k := 16*j
dst[k+15:k] := Truncate_Int64_To_Int16(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
vpmovqw
__m128i _mm512_mask_cvtepi64_epi16 (__m128i src, __mmask8 k, __m512i a)
Synopsis
__m128i _mm512_mask_cvtepi64_epi16 (__m128i src, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovqw xmm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 64*j
l := 16*j
IF k[j]
dst[l+15:l] := Truncate_Int64_To_Int16(a[i+63:i])
ELSE
dst[l+15:l] := src[l+15:l]
FI
ENDFOR
dst[MAX:128] := 0
vpmovqw
__m128i _mm512_maskz_cvtepi64_epi16 (__mmask8 k, __m512i a)
Synopsis
__m128i _mm512_maskz_cvtepi64_epi16 (__mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovqw xmm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 64*j
l := 16*j
IF k[j]
dst[l+15:l] := Truncate_Int64_To_Int16(a[i+63:i])
ELSE
dst[l+15:l] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpmovqd
__m128i _mm_cvtepi64_epi32 (__m128i a)
Synopsis
__m128i _mm_cvtepi64_epi32 (__m128i a)
#include "immintrin.h"
Instruction: vpmovqd
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst.
Operation
FOR j := 0 to 1
i := 64*j
k := 32*j
dst[k+31:k] := Truncate_Int64_To_Int32(a[i+63:i])
ENDFOR
dst[MAX:64] := 0
vpmovqd
__m128i _mm_mask_cvtepi64_epi32 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_cvtepi64_epi32 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovqd
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := 64*j
l := 32*j
IF k[j]
dst[l+31:l] := Truncate_Int64_To_Int32(a[i+63:i])
ELSE
dst[l+31:l] := src[l+31:l]
FI
ENDFOR
dst[MAX:64] := 0
vpmovqd
__m128i _mm_maskz_cvtepi64_epi32 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_cvtepi64_epi32 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovqd
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := 64*j
l := 32*j
IF k[j]
dst[l+31:l] := Truncate_Int64_To_Int32(a[i+63:i])
ELSE
dst[l+31:l] := 0
FI
ENDFOR
dst[MAX:64] := 0
vpmovqd
__m128i _mm256_cvtepi64_epi32 (__m256i a)
Synopsis
__m128i _mm256_cvtepi64_epi32 (__m256i a)
#include "immintrin.h"
Instruction: vpmovqd
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst.
Operation
FOR j := 0 to 3
i := 64*j
k := 32*j
dst[k+31:k] := Truncate_Int64_To_Int32(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
vpmovqd
__m128i _mm256_mask_cvtepi64_epi32 (__m128i src, __mmask8 k, __m256i a)
Synopsis
__m128i _mm256_mask_cvtepi64_epi32 (__m128i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovqd
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 64*j
l := 32*j
IF k[j]
dst[l+31:l] := Truncate_Int64_To_Int32(a[i+63:i])
ELSE
dst[l+31:l] := src[l+31:l]
FI
ENDFOR
dst[MAX:128] := 0
vpmovqd
__m128i _mm256_maskz_cvtepi64_epi32 (__mmask8 k, __m256i a)
Synopsis
__m128i _mm256_maskz_cvtepi64_epi32 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovqd
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 64*j
l := 32*j
IF k[j]
dst[l+31:l] := Truncate_Int64_To_Int32(a[i+63:i])
ELSE
dst[l+31:l] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpmovqd
__m256i _mm512_cvtepi64_epi32 (__m512i a)
Synopsis
__m256i _mm512_cvtepi64_epi32 (__m512i a)
#include "immintrin.h"
Instruction: vpmovqd ymm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst.
Operation
FOR j := 0 to 7
i := 64*j
k := 32*j
dst[k+31:k] := Truncate_Int64_To_Int32(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
vpmovqd
__m256i _mm512_mask_cvtepi64_epi32 (__m256i src, __mmask8 k, __m512i a)
Synopsis
__m256i _mm512_mask_cvtepi64_epi32 (__m256i src, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovqd ymm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 64*j
l := 32*j
IF k[j]
dst[l+31:l] := Truncate_Int64_To_Int32(a[i+63:i])
ELSE
dst[l+31:l] := src[l+31:l]
FI
ENDFOR
dst[MAX:256] := 0
vpmovqd
__m256i _mm512_maskz_cvtepi64_epi32 (__mmask8 k, __m512i a)
Synopsis
__m256i _mm512_maskz_cvtepi64_epi32 (__mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovqd ymm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 64*j
l := 32*j
IF k[j]
dst[l+31:l] := Truncate_Int64_To_Int32(a[i+63:i])
ELSE
dst[l+31:l] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpmovqb
__m128i _mm_cvtepi64_epi8 (__m128i a)
Synopsis
__m128i _mm_cvtepi64_epi8 (__m128i a)
#include "immintrin.h"
Instruction: vpmovqb
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst.
Operation
FOR j := 0 to 1
i := 64*j
k := 8*j
dst[k+7:k] := Truncate_Int64_To_Int8(a[i+63:i])
ENDFOR
dst[MAX:16] := 0
vpmovqb
__m128i _mm_mask_cvtepi64_epi8 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_cvtepi64_epi8 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovqb
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := 64*j
l := 8*j
IF k[j]
dst[l+7:l] := Truncate_Int64_To_Int8(a[i+63:i])
ELSE
dst[l+7:l] := src[l+7:l]
FI
ENDFOR
dst[MAX:16] := 0
vpmovqb
__m128i _mm_maskz_cvtepi64_epi8 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_cvtepi64_epi8 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovqb
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := 64*j
l := 8*j
IF k[j]
dst[l+7:l] := Truncate_Int64_To_Int8(a[i+63:i])
ELSE
dst[l+7:l] := 0
FI
ENDFOR
dst[MAX:16] := 0
vpmovqb
__m128i _mm256_cvtepi64_epi8 (__m256i a)
Synopsis
__m128i _mm256_cvtepi64_epi8 (__m256i a)
#include "immintrin.h"
Instruction: vpmovqb
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst.
Operation
FOR j := 0 to 3
i := 64*j
k := 8*j
dst[k+7:k] := Truncate_Int64_To_Int8(a[i+63:i])
ENDFOR
dst[MAX:32] := 0
vpmovqb
__m128i _mm256_mask_cvtepi64_epi8 (__m128i src, __mmask8 k, __m256i a)
Synopsis
__m128i _mm256_mask_cvtepi64_epi8 (__m128i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovqb
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 64*j
l := 8*j
IF k[j]
dst[l+7:l] := Truncate_Int64_To_Int8(a[i+63:i])
ELSE
dst[l+7:l] := src[l+7:l]
FI
ENDFOR
dst[MAX:32] := 0
vpmovqb
__m128i _mm256_maskz_cvtepi64_epi8 (__mmask8 k, __m256i a)
Synopsis
__m128i _mm256_maskz_cvtepi64_epi8 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovqb
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 64*j
l := 8*j
IF k[j]
dst[l+7:l] := Truncate_Int64_To_Int8(a[i+63:i])
ELSE
dst[l+7:l] := 0
FI
ENDFOR
dst[MAX:32] := 0
vpmovqb
__m128i _mm512_cvtepi64_epi8 (__m512i a)
Synopsis
__m128i _mm512_cvtepi64_epi8 (__m512i a)
#include "immintrin.h"
Instruction: vpmovqb xmm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst.
Operation
FOR j := 0 to 7
i := 64*j
k := 8*j
dst[k+7:k] := Truncate_Int64_To_Int8(a[i+63:i])
ENDFOR
dst[MAX:64] := 0
vpmovqb
__m128i _mm512_mask_cvtepi64_epi8 (__m128i src, __mmask8 k, __m512i a)
Synopsis
__m128i _mm512_mask_cvtepi64_epi8 (__m128i src, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovqb xmm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 64*j
l := 8*j
IF k[j]
dst[l+7:l] := Truncate_Int64_To_Int8(a[i+63:i])
ELSE
dst[l+7:l] := src[l+7:l]
FI
ENDFOR
dst[MAX:64] := 0
vpmovqb
__m128i _mm512_maskz_cvtepi64_epi8 (__mmask8 k, __m512i a)
Synopsis
__m128i _mm512_maskz_cvtepi64_epi8 (__mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovqb xmm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 64*j
l := 8*j
IF k[j]
dst[l+7:l] := Truncate_Int64_To_Int8(a[i+63:i])
ELSE
dst[l+7:l] := 0
FI
ENDFOR
dst[MAX:64] := 0
vcvtqq2pd
__m128d _mm_cvtepi64_pd (__m128i a)
Synopsis
__m128d _mm_cvtepi64_pd (__m128i a)
#include "immintrin.h"
Instruction: vcvtqq2pd
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
vcvtqq2pd
__m128d _mm_mask_cvtepi64_pd (__m128d src, __mmask8 k, __m128i a)
Synopsis
__m128d _mm_mask_cvtepi64_pd (__m128d src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vcvtqq2pd
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vcvtqq2pd
__m128d _mm_maskz_cvtepi64_pd (__mmask8 k, __m128i a)
Synopsis
__m128d _mm_maskz_cvtepi64_pd (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vcvtqq2pd
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vcvtqq2pd
__m256d _mm256_cvtepi64_pd (__m256i a)
Synopsis
__m256d _mm256_cvtepi64_pd (__m256i a)
#include "immintrin.h"
Instruction: vcvtqq2pd
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
vcvtqq2pd
__m256d _mm256_mask_cvtepi64_pd (__m256d src, __mmask8 k, __m256i a)
Synopsis
__m256d _mm256_mask_cvtepi64_pd (__m256d src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vcvtqq2pd
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vcvtqq2pd
__m256d _mm256_maskz_cvtepi64_pd (__mmask8 k, __m256i a)
Synopsis
__m256d _mm256_maskz_cvtepi64_pd (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vcvtqq2pd
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vcvtqq2pd
__m512d _mm512_cvtepi64_pd (__m512i a)
Synopsis
__m512d _mm512_cvtepi64_pd (__m512i a)
#include "immintrin.h"
Instruction: vcvtqq2pd
CPUID Flags: AVX512DQ
Description
Convert packed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
vcvtqq2pd
__m512d _mm512_mask_cvtepi64_pd (__m512d src, __mmask8 k, __m512i a)
Synopsis
__m512d _mm512_mask_cvtepi64_pd (__m512d src, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vcvtqq2pd
CPUID Flags: AVX512DQ
Description
Convert packed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vcvtqq2pd
__m512d _mm512_maskz_cvtepi64_pd (__mmask8 k, __m512i a)
Synopsis
__m512d _mm512_maskz_cvtepi64_pd (__mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vcvtqq2pd
CPUID Flags: AVX512DQ
Description
Convert packed 64-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := Convert_Int64_To_FP64(a[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vcvtqq2ps
__m128 _mm_cvtepi64_ps (__m128i a)
Synopsis
__m128 _mm_cvtepi64_ps (__m128i a)
#include "immintrin.h"
Instruction: vcvtqq2ps
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
l := j*32
dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
ENDFOR
dst[MAX:64] := 0
vcvtqq2ps
__m128 _mm_mask_cvtepi64_ps (__m128 src, __mmask8 k, __m128i a)
Synopsis
__m128 _mm_mask_cvtepi64_ps (__m128 src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vcvtqq2ps
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
l := j*32
IF k[j]
dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
ELSE
dst[l+31:l] := src[l+31:l]
FI
ENDFOR
dst[MAX:64] := 0
vcvtqq2ps
__m128 _mm_maskz_cvtepi64_ps (__mmask8 k, __m128i a)
Synopsis
__m128 _mm_maskz_cvtepi64_ps (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vcvtqq2ps
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
l := j*32
IF k[j]
dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
ELSE
dst[l+31:l] := 0
FI
ENDFOR
dst[MAX:64] := 0
vcvtqq2ps
__m128 _mm256_cvtepi64_ps (__m256i a)
Synopsis
__m128 _mm256_cvtepi64_ps (__m256i a)
#include "immintrin.h"
Instruction: vcvtqq2ps
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
l := j*32
dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
vcvtqq2ps
__m128 _mm256_mask_cvtepi64_ps (__m128 src, __mmask8 k, __m256i a)
Synopsis
__m128 _mm256_mask_cvtepi64_ps (__m128 src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vcvtqq2ps
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
l := j*32
IF k[j]
dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
ELSE
dst[l+31:l] := src[l+31:l]
FI
ENDFOR
dst[MAX:128] := 0
vcvtqq2ps
__m128 _mm256_maskz_cvtepi64_ps (__mmask8 k, __m256i a)
Synopsis
__m128 _mm256_maskz_cvtepi64_ps (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vcvtqq2ps
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
l := j*32
IF k[j]
dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
ELSE
dst[l+31:l] := 0
FI
ENDFOR
dst[MAX:128] := 0
vcvtqq2ps
__m256 _mm512_cvtepi64_ps (__m512i a)
Synopsis
__m256 _mm512_cvtepi64_ps (__m512i a)
#include "immintrin.h"
Instruction: vcvtqq2ps
CPUID Flags: AVX512DQ
Description
Convert packed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
l := j*32
dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
vcvtqq2ps
__m256 _mm512_mask_cvtepi64_ps (__m256 src, __mmask8 k, __m512i a)
Synopsis
__m256 _mm512_mask_cvtepi64_ps (__m256 src, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vcvtqq2ps
CPUID Flags: AVX512DQ
Description
Convert packed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
l := j*32
IF k[j]
dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
ELSE
dst[l+31:l] := src[l+31:l]
FI
ENDFOR
dst[MAX:256] := 0
vcvtqq2ps
__m256 _mm512_maskz_cvtepi64_ps (__mmask8 k, __m512i a)
Synopsis
__m256 _mm512_maskz_cvtepi64_ps (__mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vcvtqq2ps
CPUID Flags: AVX512DQ
Description
Convert packed 64-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
l := j*32
IF k[j]
dst[l+31:l] := Convert_Int64_To_FP32(a[i+63:i])
ELSE
dst[l+31:l] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpmovqw
void _mm_mask_cvtepi64_storeu_epi16 (void* base_addr, __mmask8 k, __m128i a)
Synopsis
void _mm_mask_cvtepi64_storeu_epi16 (void* base_addr, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovqw
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 1
i := 64*j
l := 16*j
IF k[j]
MEM[base_addr+l+15:base_addr+l] := Truncate_Int64_To_Int16(a[i+63:i])
FI
ENDFOR
vpmovqw
void _mm256_mask_cvtepi64_storeu_epi16 (void* base_addr, __mmask8 k, __m256i a)
Synopsis
void _mm256_mask_cvtepi64_storeu_epi16 (void* base_addr, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovqw
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 3
i := 64*j
l := 16*j
IF k[j]
MEM[base_addr+l+15:base_addr+l] := Truncate_Int64_To_Int16(a[i+63:i])
FI
ENDFOR
vpmovqw
void _mm512_mask_cvtepi64_storeu_epi16 (void* base_addr, __mmask8 k, __m512i a)
Synopsis
void _mm512_mask_cvtepi64_storeu_epi16 (void* base_addr, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovqw m128 {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed 64-bit integers in a to packed 16-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 7
i := 64*j
l := 16*j
IF k[j]
MEM[base_addr+l+15:base_addr+l] := Truncate_Int64_To_Int16(a[i+63:i])
FI
ENDFOR
vpmovqd
void _mm_mask_cvtepi64_storeu_epi32 (void* base_addr, __mmask8 k, __m128i a)
Synopsis
void _mm_mask_cvtepi64_storeu_epi32 (void* base_addr, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovqd
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 1
i := 64*j
l := 32*j
IF k[j]
MEM[base_addr+l+31:base_addr+l] := Truncate_Int64_To_Int32(a[i+63:i])
FI
ENDFOR
vpmovqd
void _mm256_mask_cvtepi64_storeu_epi32 (void* base_addr, __mmask8 k, __m256i a)
Synopsis
void _mm256_mask_cvtepi64_storeu_epi32 (void* base_addr, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovqd
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 3
i := 64*j
l := 32*j
IF k[j]
MEM[base_addr+l+31:base_addr+l] := Truncate_Int64_To_Int32(a[i+63:i])
FI
ENDFOR
vpmovqd
void _mm512_mask_cvtepi64_storeu_epi32 (void* base_addr, __mmask8 k, __m512i a)
Synopsis
void _mm512_mask_cvtepi64_storeu_epi32 (void* base_addr, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovqd m256 {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed 64-bit integers in a to packed 32-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 7
i := 64*j
l := 32*j
IF k[j]
MEM[base_addr+l+31:base_addr+l] := Truncate_Int64_To_Int32(a[i+63:i])
FI
ENDFOR
vpmovqb
void _mm_mask_cvtepi64_storeu_epi8 (void* base_addr, __mmask8 k, __m128i a)
Synopsis
void _mm_mask_cvtepi64_storeu_epi8 (void* base_addr, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovqb
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 1
i := 64*j
l := 8*j
IF k[j]
MEM[base_addr+l+7:base_addr+l] := Truncate_Int64_To_Int8(a[i+63:i])
FI
ENDFOR
vpmovqb
void _mm256_mask_cvtepi64_storeu_epi8 (void* base_addr, __mmask8 k, __m256i a)
Synopsis
void _mm256_mask_cvtepi64_storeu_epi8 (void* base_addr, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovqb
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 3
i := 64*j
l := 8*j
IF k[j]
MEM[base_addr+l+7:base_addr+l] := Truncate_Int64_To_Int8(a[i+63:i])
FI
ENDFOR
vpmovqb
void _mm512_mask_cvtepi64_storeu_epi8 (void* base_addr, __mmask8 k, __m512i a)
Synopsis
void _mm512_mask_cvtepi64_storeu_epi8 (void* base_addr, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovqb m64 {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed 64-bit integers in a to packed 8-bit integers with truncation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 7
i := 64*j
l := 8*j
IF k[j]
MEM[base_addr+l+7:base_addr+l] := Truncate_Int64_To_Int8(a[i+63:i])
FI
ENDFOR
pmovsxbw
__m128i _mm_cvtepi8_epi16 (__m128i a)
Synopsis
__m128i _mm_cvtepi8_epi16 (__m128i a)
#include "smmintrin.h"
Instruction: pmovsxbw xmm, xmm
CPUID Flags: SSE4.1
Description
Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*8
l := j*16
dst[l+15:l] := SignExtend(a[i+7:i])
ENDFOR
Performance
vpmovsxbw
__m128i _mm_mask_cvtepi8_epi16 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_cvtepi8_epi16 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxbw
CPUID Flags: AVX512VL + AVX512BW
Description
Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*8
l := j*16
IF k[j]
dst[l+15:l] := SignExtend(a[i+7:i])
ELSE
dst[l+15:l] := src[l+15:l]
FI
ENDFOR
dst[MAX:128] := 0
vpmovsxbw
__m128i _mm_maskz_cvtepi8_epi16 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_cvtepi8_epi16 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxbw
CPUID Flags: AVX512VL + AVX512BW
Description
Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*8
l := j*16
IF k[j]
dst[l+15:l] := SignExtend(a[i+7:i])
ELSE
dst[l+15:l] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpmovsxbw
__m256i _mm256_cvtepi8_epi16 (__m128i a)
Synopsis
__m256i _mm256_cvtepi8_epi16 (__m128i a)
#include "immintrin.h"
Instruction: vpmovsxbw ymm, xmm
CPUID Flags: AVX2
Description
Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*8
l := j*16
dst[l+15:l] := SignExtend(a[i+7:i])
ENDFOR
dst[MAX:256] := 0
Performance
vpmovsxbw
__m256i _mm256_mask_cvtepi8_epi16 (__m256i src, __mmask16 k, __m128i a)
Synopsis
__m256i _mm256_mask_cvtepi8_epi16 (__m256i src, __mmask16 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxbw
CPUID Flags: AVX512VL + AVX512BW
Description
Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
l := j*16
IF k[j]
dst[l+15:l] := SignExtend(a[i+7:i])
ELSE
dst[l+15:l] := src[l+15:l]
FI
ENDFOR
dst[MAX:256] := 0
vpmovsxbw
__m256i _mm256_maskz_cvtepi8_epi16 (__mmask16 k, __m128i a)
Synopsis
__m256i _mm256_maskz_cvtepi8_epi16 (__mmask16 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxbw
CPUID Flags: AVX512VL + AVX512BW
Description
Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
l := j*16
IF k[j]
dst[l+15:l] := SignExtend(a[i+7:i])
ELSE
dst[l+15:l] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpmovsxbw
__m512i _mm512_cvtepi8_epi16 (__m256i a)
Synopsis
__m512i _mm512_cvtepi8_epi16 (__m256i a)
#include "immintrin.h"
Instruction: vpmovsxbw
CPUID Flags: AVX512BW
Description
Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst.
Operation
FOR j := 0 to 31
i := j*8
l := j*16
dst[l+15:l] := SignExtend(a[i+7:i])
ENDFOR
dst[MAX:512] := 0
vpmovsxbw
__m512i _mm512_mask_cvtepi8_epi16 (__m512i src, __mmask32 k, __m256i a)
Synopsis
__m512i _mm512_mask_cvtepi8_epi16 (__m512i src, __mmask32 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovsxbw
CPUID Flags: AVX512BW
Description
Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
l := j*16
IF k[j]
dst[l+15:l] := SignExtend(a[i+7:i])
ELSE
dst[l+15:l] := src[l+15:l]
FI
ENDFOR
dst[MAX:512] := 0
vpmovsxbw
__m512i _mm512_maskz_cvtepi8_epi16 (__mmask32 k, __m256i a)
Synopsis
__m512i _mm512_maskz_cvtepi8_epi16 (__mmask32 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovsxbw
CPUID Flags: AVX512BW
Description
Sign extend packed 8-bit integers in a to packed 16-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
l := j*16
IF k[j]
dst[l+15:l] := SignExtend(a[i+7:i])
ELSE
dst[l+15:l] := 0
FI
ENDFOR
dst[MAX:512] := 0
pmovsxbd
__m128i _mm_cvtepi8_epi32 (__m128i a)
Synopsis
__m128i _mm_cvtepi8_epi32 (__m128i a)
#include "smmintrin.h"
Instruction: pmovsxbd xmm, xmm
CPUID Flags: SSE4.1
Description
Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst.
Operation
FOR j := 0 to 3
i := 32*j
k := 8*j
dst[i+31:i] := SignExtend(a[k+7:k])
ENDFOR
Performance
vpmovsxbd
__m128i _mm_mask_cvtepi8_epi32 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_cvtepi8_epi32 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxbd
CPUID Flags: AVX512VL + AVX512F
Description
Sign extend packed 8-bit integers in the low 4 bytes of a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 32*j
l := 8*j
IF k[j]
dst[i+31:i] := SignExtend(a[l+7:l])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vpmovsxbd
__m128i _mm_maskz_cvtepi8_epi32 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_cvtepi8_epi32 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxbd
CPUID Flags: AVX512VL + AVX512F
Description
Sign extend packed 8-bit integers in the low 4 bytes of a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 32*j
l := 8*j
IF k[j]
dst[i+31:i] := SignExtend(a[l+7:l])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpmovsxbd
__m256i _mm256_cvtepi8_epi32 (__m128i a)
Synopsis
__m256i _mm256_cvtepi8_epi32 (__m128i a)
#include "immintrin.h"
Instruction: vpmovsxbd ymm, xmm
CPUID Flags: AVX2
Description
Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst.
Operation
FOR j := 0 to 7
i := 32*j
k := 8*j
dst[i+31:i] := SignExtend(a[k+7:k])
ENDFOR
dst[MAX:256] := 0
Performance
vpmovsxbd
__m256i _mm256_mask_cvtepi8_epi32 (__m256i src, __mmask8 k, __m128i a)
Synopsis
__m256i _mm256_mask_cvtepi8_epi32 (__m256i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxbd
CPUID Flags: AVX512VL + AVX512F
Description
Sign extend packed 8-bit integers in the low 8 bytes of a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 32*j
l := 8*j
IF k[j]
dst[i+31:i] := SignExtend(a[l+7:l])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vpmovsxbd
__m256i _mm256_maskz_cvtepi8_epi32 (__mmask8 k, __m128i a)
Synopsis
__m256i _mm256_maskz_cvtepi8_epi32 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxbd
CPUID Flags: AVX512VL + AVX512F
Description
Sign extend packed 8-bit integers in the low 8 bytes of a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 32*j
l := 8*j
IF k[j]
dst[i+31:i] := SignExtend(a[l+7:l])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpmovsxbd
__m512i _mm512_cvtepi8_epi32 (__m128i a)
Synopsis
__m512i _mm512_cvtepi8_epi32 (__m128i a)
#include "immintrin.h"
Instruction: vpmovsxbd zmm {k}, xmm
CPUID Flags: AVX512F
Description
Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst.
Operation
FOR j := 0 to 15
i := 32*j
k := 8*j
dst[i+31:i] := SignExtend(a[k+7:k])
ENDFOR
dst[MAX:512] := 0
vpmovsxbd
__m512i _mm512_mask_cvtepi8_epi32 (__m512i src, __mmask16 k, __m128i a)
Synopsis
__m512i _mm512_mask_cvtepi8_epi32 (__m512i src, __mmask16 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxbd zmm {k}, xmm
CPUID Flags: AVX512F
Description
Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := 32*j
l := 8*j
IF k[j]
dst[i+31:i] := SignExtend(a[l+7:l])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpmovsxbd
__m512i _mm512_maskz_cvtepi8_epi32 (__mmask16 k, __m128i a)
Synopsis
__m512i _mm512_maskz_cvtepi8_epi32 (__mmask16 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxbd zmm {k}, xmm
CPUID Flags: AVX512F
Description
Sign extend packed 8-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := 32*j
l := 8*j
IF k[j]
dst[i+31:i] := SignExtend(a[l+7:l])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
pmovsxbq
__m128i _mm_cvtepi8_epi64 (__m128i a)
Synopsis
__m128i _mm_cvtepi8_epi64 (__m128i a)
#include "smmintrin.h"
Instruction: pmovsxbq xmm, xmm
CPUID Flags: SSE4.1
Description
Sign extend packed 8-bit integers in the low 2 bytes of a to packed 64-bit integers, and store the results in dst.
Operation
FOR j := 0 to 1
i := 64*j
k := 8*j
dst[i+63:i] := SignExtend(a[k+7:k])
ENDFOR
Performance
vpmovsxbq
__m128i _mm_mask_cvtepi8_epi64 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_cvtepi8_epi64 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxbq
CPUID Flags: AVX512VL + AVX512F
Description
Sign extend packed 8-bit integers in the low 2 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := 64*j
l := 8*j
IF k[j]
dst[i+63:i] := SignExtend(a[l+7:l])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpmovsxbq
__m128i _mm_maskz_cvtepi8_epi64 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_cvtepi8_epi64 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxbq
CPUID Flags: AVX512VL + AVX512F
Description
Sign extend packed 8-bit integers in the low 2 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := 64*j
l := 8*j
IF k[j]
dst[i+63:i] := SignExtend(a[l+7:l])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpmovsxbq
__m256i _mm256_cvtepi8_epi64 (__m128i a)
Synopsis
__m256i _mm256_cvtepi8_epi64 (__m128i a)
#include "immintrin.h"
Instruction: vpmovsxbq ymm, xmm
CPUID Flags: AVX2
Description
Sign extend packed 8-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst.
Operation
FOR j := 0 to 3
i := 64*j
k := 8*j
dst[i+63:i] := SignExtend(a[k+7:k])
ENDFOR
dst[MAX:256] := 0
Performance
vpmovsxbq
__m256i _mm256_mask_cvtepi8_epi64 (__m256i src, __mmask8 k, __m128i a)
Synopsis
__m256i _mm256_mask_cvtepi8_epi64 (__m256i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxbq
CPUID Flags: AVX512VL + AVX512F
Description
Sign extend packed 8-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 64*j
l := 8*j
IF k[j]
dst[i+63:i] := SignExtend(a[l+7:l])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpmovsxbq
__m256i _mm256_maskz_cvtepi8_epi64 (__mmask8 k, __m128i a)
Synopsis
__m256i _mm256_maskz_cvtepi8_epi64 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxbq
CPUID Flags: AVX512VL + AVX512F
Description
Sign extend packed 8-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 64*j
l := 8*j
IF k[j]
dst[i+63:i] := SignExtend(a[l+7:l])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpmovsxbq
__m512i _mm512_cvtepi8_epi64 (__m128i a)
Synopsis
__m512i _mm512_cvtepi8_epi64 (__m128i a)
#include "immintrin.h"
Instruction: vpmovsxbq zmm {k}, xmm
CPUID Flags: AVX512F
Description
Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst.
Operation
FOR j := 0 to 7
i := 64*j
k := 8*j
dst[i+63:i] := SignExtend(a[k+7:k])
ENDFOR
dst[MAX:512] := 0
vpmovsxbq
__m512i _mm512_mask_cvtepi8_epi64 (__m512i src, __mmask8 k, __m128i a)
Synopsis
__m512i _mm512_mask_cvtepi8_epi64 (__m512i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxbq zmm {k}, xmm
CPUID Flags: AVX512F
Description
Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 64*j
l := 8*j
IF k[j]
dst[i+63:i] := SignExtend(a[l+7:l])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpmovsxbq
__m512i _mm512_maskz_cvtepi8_epi64 (__mmask8 k, __m128i a)
Synopsis
__m512i _mm512_maskz_cvtepi8_epi64 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsxbq zmm {k}, xmm
CPUID Flags: AVX512F
Description
Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 64*j
l := 8*j
IF k[j]
dst[i+63:i] := SignExtend(a[l+7:l])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
pmovzxwd
__m128i _mm_cvtepu16_epi32 (__m128i a)
Synopsis
__m128i _mm_cvtepu16_epi32 (__m128i a)
#include "smmintrin.h"
Instruction: pmovzxwd xmm, xmm
CPUID Flags: SSE4.1
Description
Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst.
Operation
FOR j := 0 to 3
i := 32*j
k := 16*j
dst[i+31:i] := ZeroExtend(a[k+15:k])
ENDFOR
Performance
vpmovzxwd
__m128i _mm_mask_cvtepu16_epi32 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_cvtepu16_epi32 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxwd
CPUID Flags: AVX512VL + AVX512F
Description
Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 32*j
l := 16*j
IF k[j]
dst[i+31:i] := ZeroExtend(a[l+15:l])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vpmovzxwd
__m128i _mm_maskz_cvtepu16_epi32 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_cvtepu16_epi32 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxwd
CPUID Flags: AVX512VL + AVX512F
Description
Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 32*j
l := 16*j
IF k[j]
dst[i+31:i] := ZeroExtend(a[l+15:l])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpmovzxwd
__m256i _mm256_cvtepu16_epi32 (__m128i a)
Synopsis
__m256i _mm256_cvtepu16_epi32 (__m128i a)
#include "immintrin.h"
Instruction: vpmovzxwd ymm, xmm
CPUID Flags: AVX2
Description
Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst.
Operation
FOR j := 0 to 7
i := 32*j
k := 16*j
dst[i+31:i] := ZeroExtend(a[k+15:k])
ENDFOR
dst[MAX:256] := 0
Performance
vpmovzxwd
__m256i _mm256_mask_cvtepu16_epi32 (__m256i src, __mmask8 k, __m128i a)
Synopsis
__m256i _mm256_mask_cvtepu16_epi32 (__m256i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxwd
CPUID Flags: AVX512VL + AVX512F
Description
Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 32*j
l := 16*j
IF k[j]
dst[i+31:i] := ZeroExtend(a[l+15:l])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vpmovzxwd
__m256i _mm256_maskz_cvtepu16_epi32 (__mmask8 k, __m128i a)
Synopsis
__m256i _mm256_maskz_cvtepu16_epi32 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxwd
CPUID Flags: AVX512VL + AVX512F
Description
Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 32*j
l := 16*j
IF k[j]
dst[i+31:i] := ZeroExtend(a[l+15:l])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpmovzxwd
__m512i _mm512_cvtepu16_epi32 (__m256i a)
Synopsis
__m512i _mm512_cvtepu16_epi32 (__m256i a)
#include "immintrin.h"
Instruction: vpmovzxwd zmm {k}, ymm
CPUID Flags: AVX512F
Description
Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst.
Operation
FOR j := 0 to 15
i := 32*j
k := 16*j
dst[i+31:i] := ZeroExtend(a[k+15:k])
ENDFOR
dst[MAX:512] := 0
vpmovzxwd
__m512i _mm512_mask_cvtepu16_epi32 (__m512i src, __mmask16 k, __m256i a)
Synopsis
__m512i _mm512_mask_cvtepu16_epi32 (__m512i src, __mmask16 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovzxwd zmm {k}, ymm
CPUID Flags: AVX512F
Description
Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := 32*j
l := 16*j
IF k[j]
dst[i+31:i] := ZeroExtend(a[l+15:l])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpmovzxwd
__m512i _mm512_maskz_cvtepu16_epi32 (__mmask16 k, __m256i a)
Synopsis
__m512i _mm512_maskz_cvtepu16_epi32 (__mmask16 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovzxwd zmm {k}, ymm
CPUID Flags: AVX512F
Description
Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := 32*j
l := 16*j
IF k[j]
dst[i+31:i] := ZeroExtend(a[l+15:l])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
pmovzxwq
__m128i _mm_cvtepu16_epi64 (__m128i a)
Synopsis
__m128i _mm_cvtepu16_epi64 (__m128i a)
#include "smmintrin.h"
Instruction: pmovzxwq xmm, xmm
CPUID Flags: SSE4.1
Description
Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers, and store the results in dst.
Operation
FOR j := 0 to 1
i := 64*j
k := 16*j
dst[i+63:i] := ZeroExtend(a[k+15:k])
ENDFOR
Performance
vpmovzxwq
__m128i _mm_mask_cvtepu16_epi64 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_cvtepu16_epi64 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxwq
CPUID Flags: AVX512VL + AVX512F
Description
Zero extend packed unsigned 16-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := 64*j
l := 16*j
IF k[j]
dst[i+63:i] := ZeroExtend(a[l+15:l])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpmovzxwq
__m128i _mm_maskz_cvtepu16_epi64 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_cvtepu16_epi64 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxwq
CPUID Flags: AVX512VL + AVX512F
Description
Zero extend packed unsigned 16-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := 64*j
l := 16*j
IF k[j]
dst[i+63:i] := ZeroExtend(a[l+15:l])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpmovzxwq
__m256i _mm256_cvtepu16_epi64 (__m128i a)
Synopsis
__m256i _mm256_cvtepu16_epi64 (__m128i a)
#include "immintrin.h"
Instruction: vpmovzxwq ymm, xmm
CPUID Flags: AVX2
Description
Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers, and store the results in dst.
Operation
FOR j := 0 to 3
i := 64*j
k := 16*j
dst[i+63:i] := ZeroExtend(a[k+15:k])
ENDFOR
dst[MAX:256] := 0
Performance
vpmovzxwq
__m256i _mm256_mask_cvtepu16_epi64 (__m256i src, __mmask8 k, __m128i a)
Synopsis
__m256i _mm256_mask_cvtepu16_epi64 (__m256i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxwq
CPUID Flags: AVX512VL + AVX512F
Description
Zero extend packed unsigned 16-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 64*j
l := 16*j
IF k[j]
dst[i+63:i] := ZeroExtend(a[l+15:l])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpmovzxwq
__m256i _mm256_maskz_cvtepu16_epi64 (__mmask8 k, __m128i a)
Synopsis
__m256i _mm256_maskz_cvtepu16_epi64 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxwq
CPUID Flags: AVX512VL + AVX512F
Description
Zero extend packed unsigned 16-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 64*j
l := 16*j
IF k[j]
dst[i+63:i] := ZeroExtend(a[l+15:l])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpmovzxwq
__m512i _mm512_cvtepu16_epi64 (__m128i a)
Synopsis
__m512i _mm512_cvtepu16_epi64 (__m128i a)
#include "immintrin.h"
Instruction: vpmovzxwq zmm {k}, xmm
CPUID Flags: AVX512F
Description
Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers, and store the results in dst.
Operation
FOR j := 0 to 7
i := 64*j
k := 16*j
dst[i+63:i] := ZeroExtend(a[k+15:k])
ENDFOR
dst[MAX:512] := 0
vpmovzxwq
__m512i _mm512_mask_cvtepu16_epi64 (__m512i src, __mmask8 k, __m128i a)
Synopsis
__m512i _mm512_mask_cvtepu16_epi64 (__m512i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxwq zmm {k}, xmm
CPUID Flags: AVX512F
Description
Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 64*j
l := 16*j
IF k[j]
dst[i+63:i] := ZeroExtend(a[l+15:l])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpmovzxwq
__m512i _mm512_maskz_cvtepu16_epi64 (__mmask8 k, __m128i a)
Synopsis
__m512i _mm512_maskz_cvtepu16_epi64 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxwq zmm {k}, xmm
CPUID Flags: AVX512F
Description
Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 64*j
l := 16*j
IF k[j]
dst[i+63:i] := ZeroExtend(a[l+15:l])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
pmovzxdq
__m128i _mm_cvtepu32_epi64 (__m128i a)
Synopsis
__m128i _mm_cvtepu32_epi64 (__m128i a)
#include "smmintrin.h"
Instruction: pmovzxdq xmm, xmm
CPUID Flags: SSE4.1
Description
Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst.
Operation
FOR j := 0 to 1
i := 64*j
k := 32*j
dst[i+63:i] := ZeroExtend(a[k+31:k])
ENDFOR
Performance
vpmovzxdq
__m128i _mm_mask_cvtepu32_epi64 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_cvtepu32_epi64 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxdq
CPUID Flags: AVX512VL + AVX512F
Description
Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := 64*j
l := 32*j
IF k[j]
dst[i+63:i] := ZeroExtend(a[l+31:l])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpmovzxdq
__m128i _mm_maskz_cvtepu32_epi64 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_cvtepu32_epi64 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxdq
CPUID Flags: AVX512VL + AVX512F
Description
Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := 64*j
l := 32*j
IF k[j]
dst[i+63:i] := ZeroExtend(a[l+31:l])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpmovzxdq
__m256i _mm256_cvtepu32_epi64 (__m128i a)
Synopsis
__m256i _mm256_cvtepu32_epi64 (__m128i a)
#include "immintrin.h"
Instruction: vpmovzxdq ymm, xmm
CPUID Flags: AVX2
Description
Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst.
Operation
FOR j := 0 to 3
i := 64*j
k := 32*j
dst[i+63:i] := ZeroExtend(a[k+31:k])
ENDFOR
dst[MAX:256] := 0
Performance
vpmovzxdq
__m256i _mm256_mask_cvtepu32_epi64 (__m256i src, __mmask8 k, __m128i a)
Synopsis
__m256i _mm256_mask_cvtepu32_epi64 (__m256i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxdq
CPUID Flags: AVX512VL + AVX512F
Description
Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 64*j
l := 32*j
IF k[j]
dst[i+63:i] := ZeroExtend(a[l+31:l])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpmovzxdq
__m256i _mm256_maskz_cvtepu32_epi64 (__mmask8 k, __m128i a)
Synopsis
__m256i _mm256_maskz_cvtepu32_epi64 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxdq
CPUID Flags: AVX512VL + AVX512F
Description
Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 64*j
l := 32*j
IF k[j]
dst[i+63:i] := ZeroExtend(a[l+31:l])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpmovzxdq
__m512i _mm512_cvtepu32_epi64 (__m256i a)
Synopsis
__m512i _mm512_cvtepu32_epi64 (__m256i a)
#include "immintrin.h"
Instruction: vpmovzxdq zmm {k}, ymm
CPUID Flags: AVX512F
Description
Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst.
Operation
FOR j := 0 to 7
i := 64*j
k := 32*j
dst[i+63:i] := ZeroExtend(a[k+31:k])
ENDFOR
dst[MAX:512] := 0
vpmovzxdq
__m512i _mm512_mask_cvtepu32_epi64 (__m512i src, __mmask8 k, __m256i a)
Synopsis
__m512i _mm512_mask_cvtepu32_epi64 (__m512i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovzxdq zmm {k}, ymm
CPUID Flags: AVX512F
Description
Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 64*j
l := 32*j
IF k[j]
dst[i+63:i] := ZeroExtend(a[l+31:l])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpmovzxdq
__m512i _mm512_maskz_cvtepu32_epi64 (__mmask8 k, __m256i a)
Synopsis
__m512i _mm512_maskz_cvtepu32_epi64 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovzxdq zmm {k}, ymm
CPUID Flags: AVX512F
Description
Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 64*j
l := 32*j
IF k[j]
dst[i+63:i] := ZeroExtend(a[l+31:l])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vcvtudq2pd
__m128d _mm_cvtepu32_pd (__m128i a)
Synopsis
__m128d _mm_cvtepu32_pd (__m128i a)
#include "immintrin.h"
Instruction: vcvtudq2pd
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
l := j*32
dst[i+63:i] := ConvertUnsignedIntegerTo_FP64(a[l+31:l])
ENDFOR
dst[MAX:128] := 0
vcvtudq2pd
__m128d _mm_mask_cvtepu32_pd (__m128d src, __mmask8 k, __m128i a)
Synopsis
__m128d _mm_mask_cvtepu32_pd (__m128d src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vcvtudq2pd
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
l := j*32
IF k[j]
dst[i+63:i] := ConvertUnsignedIntegerTo_FP64(a[l+31:l])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vcvtudq2pd
__m128d _mm_maskz_cvtepu32_pd (__mmask8 k, __m128i a)
Synopsis
__m128d _mm_maskz_cvtepu32_pd (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vcvtudq2pd
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
l := j*32
IF k[j]
dst[i+63:i] := ConvertUnsignedIntegerTo_FP64(a[l+31:l])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vcvtudq2pd
__m256d _mm256_cvtepu32_pd (__m128i a)
Synopsis
__m256d _mm256_cvtepu32_pd (__m128i a)
#include "immintrin.h"
Instruction: vcvtudq2pd
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
l := j*32
dst[i+63:i] := ConvertUnsignedIntegerTo_FP64(a[l+31:l])
ENDFOR
dst[MAX:256] := 0
vcvtudq2pd
__m256d _mm256_mask_cvtepu32_pd (__m256d src, __mmask8 k, __m128i a)
Synopsis
__m256d _mm256_mask_cvtepu32_pd (__m256d src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vcvtudq2pd
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
l := j*32
IF k[j]
dst[i+63:i] := ConvertUnsignedIntegerTo_FP64(a[l+31:l])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vcvtudq2pd
__m256d _mm256_maskz_cvtepu32_pd (__mmask8 k, __m128i a)
Synopsis
__m256d _mm256_maskz_cvtepu32_pd (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vcvtudq2pd
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
l := j*32
IF k[j]
dst[i+63:i] := ConvertUnsignedIntegerTo_FP64(a[l+31:l])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vcvtudq2pd
__m512d _mm512_cvtepu32_pd (__m256i a)
Synopsis
__m512d _mm512_cvtepu32_pd (__m256i a)
#include "immintrin.h"
Instruction: vcvtudq2pd zmm {k}, ymm
CPUID Flags: AVX512F
Description
Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
l := j*32
dst[i+63:i] := ConvertUnsignedIntegerTo_FP64(a[l+31:l])
ENDFOR
dst[MAX:512] := 0
vcvtudq2pd
__m512d _mm512_mask_cvtepu32_pd (__m512d src, __mmask8 k, __m256i a)
Synopsis
__m512d _mm512_mask_cvtepu32_pd (__m512d src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vcvtudq2pd zmm {k}, ymm
CPUID Flags: AVX512F
Description
Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
l := j*32
IF k[j]
dst[i+63:i] := ConvertUnsignedIntegerTo_FP64(a[l+31:l])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vcvtudq2pd
__m512d _mm512_maskz_cvtepu32_pd (__mmask8 k, __m256i a)
Synopsis
__m512d _mm512_maskz_cvtepu32_pd (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vcvtudq2pd zmm {k}, ymm
CPUID Flags: AVX512F
Description
Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
l := j*32
IF k[j]
dst[i+63:i] := ConvertUnsignedIntegerTo_FP64(a[l+31:l])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vcvtudq2ps
__m512 _mm512_cvtepu32_ps (__m512i a)
Synopsis
__m512 _mm512_cvtepu32_ps (__m512i a)
#include "immintrin.h"
Instruction: vcvtudq2ps zmm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed unsigned 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.
Operation
FOR j := 0 to 15
i := 32*j
dst[i+31:i] := ConvertUnsignedInt32_To_FP32(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
vcvtudq2ps
__m512 _mm512_mask_cvtepu32_ps (__m512 src, __mmask16 k, __m512i a)
Synopsis
__m512 _mm512_mask_cvtepu32_ps (__m512 src, __mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vcvtudq2ps zmm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed unsigned 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := ConvertUnsignedInt32_To_FP32(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vcvtudq2ps
__m512 _mm512_maskz_cvtepu32_ps (__mmask16 k, __m512i a)
Synopsis
__m512 _mm512_maskz_cvtepu32_ps (__mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vcvtudq2ps zmm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed unsigned 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := 32*j
IF k[j]
dst[i+31:i] := ConvertUnsignedInt32_To_FP32(a[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vcvtudq2pd
__m512d _mm512_cvtepu32lo_pd (__m512i v2)
Synopsis
__m512d _mm512_cvtepu32lo_pd (__m512i v2)
#include "immintrin.h"
Instruction: vcvtudq2pd zmm {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Performs element-by-element conversion of the lower half of packed 32-bit unsigned integer elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst.
Operation
FOR j := 0 to 7
i := j*32
k := j*64
dst[k+63:k] := UInt32ToFloat64(v2[i+31:i])
ENDFOR
dst[MAX:512] := 0
vcvtudq2pd
__m512d _mm512_mask_cvtepu32lo_pd (__m512d src, __mmask8 k, __m512i v2)
Synopsis
__m512d _mm512_mask_cvtepu32lo_pd (__m512d src, __mmask8 k, __m512i v2)
#include "immintrin.h"
Instruction: vcvtudq2pd zmm {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Performs element-by-element conversion of the lower half of 32-bit unsigned integer elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
l := j*64
IF k[j]
dst[l+63:l] := UInt32ToFloat64(v2[i+31:i])
ELSE
dst[l+63:l] := src[l+63:l]
FI
ENDFOR
dst[MAX:512] := 0
vcvtuqq2pd
__m128d _mm_cvtepu64_pd (__m128i a)
Synopsis
__m128d _mm_cvtepu64_pd (__m128i a)
#include "immintrin.h"
Instruction: vcvtuqq2pd
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
vcvtuqq2pd
__m128d _mm_mask_cvtepu64_pd (__m128d src, __mmask8 k, __m128i a)
Synopsis
__m128d _mm_mask_cvtepu64_pd (__m128d src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vcvtuqq2pd
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vcvtuqq2pd
__m128d _mm_maskz_cvtepu64_pd (__mmask8 k, __m128i a)
Synopsis
__m128d _mm_maskz_cvtepu64_pd (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vcvtuqq2pd
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vcvtuqq2pd
__m256d _mm256_cvtepu64_pd (__m256i a)
Synopsis
__m256d _mm256_cvtepu64_pd (__m256i a)
#include "immintrin.h"
Instruction: vcvtuqq2pd
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
vcvtuqq2pd
__m256d _mm256_mask_cvtepu64_pd (__m256d src, __mmask8 k, __m256i a)
Synopsis
__m256d _mm256_mask_cvtepu64_pd (__m256d src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vcvtuqq2pd
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vcvtuqq2pd
__m256d _mm256_maskz_cvtepu64_pd (__mmask8 k, __m256i a)
Synopsis
__m256d _mm256_maskz_cvtepu64_pd (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vcvtuqq2pd
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vcvtuqq2pd
__m512d _mm512_cvtepu64_pd (__m512i a)
Synopsis
__m512d _mm512_cvtepu64_pd (__m512i a)
#include "immintrin.h"
Instruction: vcvtuqq2pd
CPUID Flags: AVX512DQ
Description
Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
vcvtuqq2pd
__m512d _mm512_mask_cvtepu64_pd (__m512d src, __mmask8 k, __m512i a)
Synopsis
__m512d _mm512_mask_cvtepu64_pd (__m512d src, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vcvtuqq2pd
CPUID Flags: AVX512DQ
Description
Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vcvtuqq2pd
__m512d _mm512_maskz_cvtepu64_pd (__mmask8 k, __m512i a)
Synopsis
__m512d _mm512_maskz_cvtepu64_pd (__mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vcvtuqq2pd
CPUID Flags: AVX512DQ
Description
Convert packed unsigned 64-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := ConvertUnsignedInt64_To_FP64(a[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vcvtuqq2ps
__m128 _mm_cvtepu64_ps (__m128i a)
Synopsis
__m128 _mm_cvtepu64_ps (__m128i a)
#include "immintrin.h"
Instruction: vcvtuqq2ps
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
l := j*32
dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i])
ENDFOR
dst[MAX:64] := 0
vcvtuqq2ps
__m128 _mm_mask_cvtepu64_ps (__m128 src, __mmask8 k, __m128i a)
Synopsis
__m128 _mm_mask_cvtepu64_ps (__m128 src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vcvtuqq2ps
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
l := j*32
IF k[j]
dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i])
ELSE
dst[l+31:l] := src[l+31:l]
FI
ENDFOR
dst[MAX:64] := 0
vcvtuqq2ps
__m128 _mm_maskz_cvtepu64_ps (__mmask8 k, __m128i a)
Synopsis
__m128 _mm_maskz_cvtepu64_ps (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vcvtuqq2ps
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
l := j*32
IF k[j]
dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i])
ELSE
dst[l+31:l] := 0
FI
ENDFOR
dst[MAX:64] := 0
vcvtuqq2ps
__m128 _mm256_cvtepu64_ps (__m256i a)
Synopsis
__m128 _mm256_cvtepu64_ps (__m256i a)
#include "immintrin.h"
Instruction: vcvtuqq2ps
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
l := j*32
dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
vcvtuqq2ps
__m128 _mm256_mask_cvtepu64_ps (__m128 src, __mmask8 k, __m256i a)
Synopsis
__m128 _mm256_mask_cvtepu64_ps (__m128 src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vcvtuqq2ps
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
l := j*32
IF k[j]
dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i])
ELSE
dst[l+31:l] := src[l+31:l]
FI
ENDFOR
dst[MAX:128] := 0
vcvtuqq2ps
__m128 _mm256_maskz_cvtepu64_ps (__mmask8 k, __m256i a)
Synopsis
__m128 _mm256_maskz_cvtepu64_ps (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vcvtuqq2ps
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
l := j*32
IF k[j]
dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i])
ELSE
dst[l+31:l] := 0
FI
ENDFOR
dst[MAX:128] := 0
vcvtuqq2ps
__m256 _mm512_cvtepu64_ps (__m512i a)
Synopsis
__m256 _mm512_cvtepu64_ps (__m512i a)
#include "immintrin.h"
Instruction: vcvtuqq2ps
CPUID Flags: AVX512DQ
Description
Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
l := j*32
dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
vcvtuqq2ps
__m256 _mm512_mask_cvtepu64_ps (__m256 src, __mmask8 k, __m512i a)
Synopsis
__m256 _mm512_mask_cvtepu64_ps (__m256 src, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vcvtuqq2ps
CPUID Flags: AVX512DQ
Description
Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
l := j*32
IF k[j]
dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i])
ELSE
dst[l+31:l] := src[l+31:l]
FI
ENDFOR
dst[MAX:256] := 0
vcvtuqq2ps
__m256 _mm512_maskz_cvtepu64_ps (__mmask8 k, __m512i a)
Synopsis
__m256 _mm512_maskz_cvtepu64_ps (__mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vcvtuqq2ps
CPUID Flags: AVX512DQ
Description
Convert packed unsigned 64-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
l := j*32
IF k[j]
dst[l+31:l] := ConvertUnsignedInt64_To_FP32(a[i+63:i])
ELSE
dst[l+31:l] := 0
FI
ENDFOR
dst[MAX:256] := 0
pmovzxbw
__m128i _mm_cvtepu8_epi16 (__m128i a)
Synopsis
__m128i _mm_cvtepu8_epi16 (__m128i a)
#include "smmintrin.h"
Instruction: pmovzxbw xmm, xmm
CPUID Flags: SSE4.1
Description
Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*8
l := j*16
dst[l+15:l] := ZeroExtend(a[i+7:i])
ENDFOR
Performance
vpmovzxbw
__m128i _mm_mask_cvtepu8_epi16 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_cvtepu8_epi16 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxbw
CPUID Flags: AVX512VL + AVX512BW
Description
Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*8
l := j*16
IF k[j]
dst[l+15:l] := ZeroExtend(a[i+7:i])
ELSE
dst[l+15:l] := src[l+15:l]
FI
ENDFOR
dst[MAX:128] := 0
vpmovzxbw
__m128i _mm_maskz_cvtepu8_epi16 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_cvtepu8_epi16 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxbw
CPUID Flags: AVX512VL + AVX512BW
Description
Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*8
l := j*16
IF k[j]
dst[l+15:l] := ZeroExtend(a[i+7:i])
ELSE
dst[l+15:l] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpmovzxbw
__m256i _mm256_cvtepu8_epi16 (__m128i a)
Synopsis
__m256i _mm256_cvtepu8_epi16 (__m128i a)
#include "immintrin.h"
Instruction: vpmovzxbw ymm, xmm
CPUID Flags: AVX2
Description
Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*8
l := j*16
dst[l+15:l] := ZeroExtend(a[i+7:i])
ENDFOR
dst[MAX:256] := 0
Performance
vpmovzxbw
__m256i _mm256_mask_cvtepu8_epi16 (__m256i src, __mmask16 k, __m128i a)
Synopsis
__m256i _mm256_mask_cvtepu8_epi16 (__m256i src, __mmask16 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxbw
CPUID Flags: AVX512VL + AVX512BW
Description
Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
l := j*16
IF k[j]
dst[l+15:l] := ZeroExtend(a[i+7:i])
ELSE
dst[l+15:l] := src[l+15:l]
FI
ENDFOR
dst[MAX:256] := 0
vpmovzxbw
__m256i _mm256_maskz_cvtepu8_epi16 (__mmask16 k, __m128i a)
Synopsis
__m256i _mm256_maskz_cvtepu8_epi16 (__mmask16 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxbw
CPUID Flags: AVX512VL + AVX512BW
Description
Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
l := j*16
IF k[j]
dst[l+15:l] := ZeroExtend(a[i+7:i])
ELSE
dst[l+15:l] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpmovzxbw
__m512i _mm512_cvtepu8_epi16 (__m256i a)
Synopsis
__m512i _mm512_cvtepu8_epi16 (__m256i a)
#include "immintrin.h"
Instruction: vpmovzxbw
CPUID Flags: AVX512BW
Description
Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst.
Operation
FOR j := 0 to 31
i := j*8
l := j*16
dst[l+15:l] := ZeroExtend(a[i+7:i])
ENDFOR
dst[MAX:512] := 0
vpmovzxbw
__m512i _mm512_mask_cvtepu8_epi16 (__m512i src, __mmask32 k, __m256i a)
Synopsis
__m512i _mm512_mask_cvtepu8_epi16 (__m512i src, __mmask32 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovzxbw
CPUID Flags: AVX512BW
Description
Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
l := j*16
IF k[j]
dst[l+15:l] := ZeroExtend(a[i+7:i])
ELSE
dst[l+15:l] := src[l+15:l]
FI
ENDFOR
dst[MAX:512] := 0
vpmovzxbw
__m512i _mm512_maskz_cvtepu8_epi16 (__mmask32 k, __m256i a)
Synopsis
__m512i _mm512_maskz_cvtepu8_epi16 (__mmask32 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovzxbw
CPUID Flags: AVX512BW
Description
Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
l := j*16
IF k[j]
dst[l+15:l] := ZeroExtend(a[i+7:i])
ELSE
dst[l+15:l] := 0
FI
ENDFOR
dst[MAX:512] := 0
pmovzxbd
__m128i _mm_cvtepu8_epi32 (__m128i a)
Synopsis
__m128i _mm_cvtepu8_epi32 (__m128i a)
#include "smmintrin.h"
Instruction: pmovzxbd xmm, xmm
CPUID Flags: SSE4.1
Description
Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers, and store the results in dst.
Operation
FOR j := 0 to 3
i := 32*j
k := 8*j
dst[i+31:i] := ZeroExtend(a[k+7:k])
ENDFOR
Performance
vpmovzxbd
__m128i _mm_mask_cvtepu8_epi32 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_cvtepu8_epi32 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxbd
CPUID Flags: AVX512VL + AVX512F
Description
Zero extend packed unsigned 8-bit integers in the low 4 bytes of a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 32*j
l := 8*j
IF k[j]
dst[i+31:i] := ZeroExtend(a[l+7:l])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vpmovzxbd
__m128i _mm_maskz_cvtepu8_epi32 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_cvtepu8_epi32 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxbd
CPUID Flags: AVX512VL + AVX512F
Description
Zero extend packed unsigned 8-bit integers in the low 4 bytes of a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 32*j
l := 8*j
IF k[j]
dst[i+31:i] := ZeroExtend(a[l+7:l])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpmovzxbd
__m256i _mm256_cvtepu8_epi32 (__m128i a)
Synopsis
__m256i _mm256_cvtepu8_epi32 (__m128i a)
#include "immintrin.h"
Instruction: vpmovzxbd ymm, xmm
CPUID Flags: AVX2
Description
Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers, and store the results in dst.
Operation
FOR j := 0 to 7
i := 32*j
k := 8*j
dst[i+31:i] := ZeroExtend(a[k+7:k])
ENDFOR
dst[MAX:256] := 0
Performance
vpmovzxbd
__m256i _mm256_mask_cvtepu8_epi32 (__m256i src, __mmask8 k, __m128i a)
Synopsis
__m256i _mm256_mask_cvtepu8_epi32 (__m256i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxbd
CPUID Flags: AVX512VL + AVX512F
Description
Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 32*j
l := 8*j
IF k[j]
dst[i+31:i] := ZeroExtend(a[l+7:l])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vpmovzxbd
__m256i _mm256_maskz_cvtepu8_epi32 (__mmask8 k, __m128i a)
Synopsis
__m256i _mm256_maskz_cvtepu8_epi32 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxbd
CPUID Flags: AVX512VL + AVX512F
Description
Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 32*j
l := 8*j
IF k[j]
dst[i+31:i] := ZeroExtend(a[l+7:l])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpmovzxbd
__m512i _mm512_cvtepu8_epi32 (__m128i a)
Synopsis
__m512i _mm512_cvtepu8_epi32 (__m128i a)
#include "immintrin.h"
Instruction: vpmovzxbd zmm {k}, xmm
CPUID Flags: AVX512F
Description
Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers, and store the results in dst.
Operation
FOR j := 0 to 15
i := 32*j
k := 8*j
dst[i+31:i] := ZeroExtend(a[k+7:k])
ENDFOR
dst[MAX:512] := 0
vpmovzxbd
__m512i _mm512_mask_cvtepu8_epi32 (__m512i src, __mmask16 k, __m128i a)
Synopsis
__m512i _mm512_mask_cvtepu8_epi32 (__m512i src, __mmask16 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxbd zmm {k}, xmm
CPUID Flags: AVX512F
Description
Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := 32*j
l := 8*j
IF k[j]
dst[i+31:i] := ZeroExtend(a[l+7:l])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpmovzxbd
__m512i _mm512_maskz_cvtepu8_epi32 (__mmask16 k, __m128i a)
Synopsis
__m512i _mm512_maskz_cvtepu8_epi32 (__mmask16 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxbd zmm {k}, xmm
CPUID Flags: AVX512F
Description
Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := 32*j
l := 8*j
IF k[j]
dst[i+31:i] := ZeroExtend(a[l+7:l])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
pmovzxbq
__m128i _mm_cvtepu8_epi64 (__m128i a)
Synopsis
__m128i _mm_cvtepu8_epi64 (__m128i a)
#include "smmintrin.h"
Instruction: pmovzxbq xmm, xmm
CPUID Flags: SSE4.1
Description
Zero extend packed unsigned 8-bit integers in the low 2 bytes of a to packed 64-bit integers, and store the results in dst.
Operation
FOR j := 0 to 1
i := 64*j
k := 8*j
dst[i+63:i] := ZeroExtend(a[k+7:k])
ENDFOR
Performance
vpmovzxbq
__m128i _mm_mask_cvtepu8_epi64 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_cvtepu8_epi64 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxbq
CPUID Flags: AVX512VL + AVX512F
Description
Zero extend packed unsigned 8-bit integers in the low 2 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := 64*j
l := 8*j
IF k[j]
dst[i+63:i] := ZeroExtend(a[l+7:l])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpmovzxbq
__m128i _mm_maskz_cvtepu8_epi64 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_cvtepu8_epi64 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxbq
CPUID Flags: AVX512VL + AVX512F
Description
Zero extend packed unsigned 8-bit integers in the low 2 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := 64*j
l := 8*j
IF k[j]
dst[i+63:i] := ZeroExtend(a[l+7:l])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpmovzxbq
__m256i _mm256_cvtepu8_epi64 (__m128i a)
Synopsis
__m256i _mm256_cvtepu8_epi64 (__m128i a)
#include "immintrin.h"
Instruction: vpmovzxbq ymm, xmm
CPUID Flags: AVX2
Description
Zero extend packed unsigned 8-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst.
Operation
FOR j := 0 to 3
i := 64*j
k := 8*j
dst[i+63:i] := ZeroExtend(a[k+7:k])
ENDFOR
dst[MAX:256] := 0
Performance
vpmovzxbq
__m256i _mm256_mask_cvtepu8_epi64 (__m256i src, __mmask8 k, __m128i a)
Synopsis
__m256i _mm256_mask_cvtepu8_epi64 (__m256i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxbq
CPUID Flags: AVX512VL + AVX512F
Description
Zero extend packed unsigned 8-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 64*j
l := 8*j
IF k[j]
dst[i+63:i] := ZeroExtend(a[l+7:l])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpmovzxbq
__m256i _mm256_maskz_cvtepu8_epi64 (__mmask8 k, __m128i a)
Synopsis
__m256i _mm256_maskz_cvtepu8_epi64 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxbq
CPUID Flags: AVX512VL + AVX512F
Description
Zero extend packed unsigned 8-bit integers in the low 4 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 64*j
l := 8*j
IF k[j]
dst[i+63:i] := ZeroExtend(a[l+7:l])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpmovzxbq
__m512i _mm512_cvtepu8_epi64 (__m128i a)
Synopsis
__m512i _mm512_cvtepu8_epi64 (__m128i a)
#include "immintrin.h"
Instruction: vpmovzxbq zmm {k}, xmm
CPUID Flags: AVX512F
Description
Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst.
Operation
FOR j := 0 to 7
i := 64*j
k := 8*j
dst[i+63:i] := ZeroExtend(a[k+7:k])
ENDFOR
dst[MAX:512] := 0
vpmovzxbq
__m512i _mm512_mask_cvtepu8_epi64 (__m512i src, __mmask8 k, __m128i a)
Synopsis
__m512i _mm512_mask_cvtepu8_epi64 (__m512i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxbq zmm {k}, xmm
CPUID Flags: AVX512F
Description
Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 64*j
l := 8*j
IF k[j]
dst[i+63:i] := ZeroExtend(a[l+7:l])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpmovzxbq
__m512i _mm512_maskz_cvtepu8_epi64 (__mmask8 k, __m128i a)
Synopsis
__m512i _mm512_maskz_cvtepu8_epi64 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovzxbq zmm {k}, xmm
CPUID Flags: AVX512F
Description
Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 64*j
l := 8*j
IF k[j]
dst[i+63:i] := ZeroExtend(a[l+7:l])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vcvtfxpntdq2ps
__m512 _mm512_cvtfxpnt_round_adjustepi32_ps (__m512i v2, int rounding, _MM_EXP_ADJ_ENUM expadj)
Synopsis
__m512 _mm512_cvtfxpnt_round_adjustepi32_ps (__m512i v2, int rounding, _MM_EXP_ADJ_ENUM expadj)
#include "immintrin.h"
Instruction: vcvtfxpntdq2ps zmm {k}, m512, imm
CPUID Flags: KNCNI
Description
Performs element-by-element conversion of packed 32-bit integer elements in
v2 to packed single-precision (32-bit) floating-point elements and performing an optional exponent adjust using
expadj, storing the results in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := Int32ToFloat32(v2[i+31:i])
CASE expadj OF
_MM_EXPADJ_NONE: dst[i+31:i] = dst[i+31:i] * 2**0
_MM_EXPADJ_4: dst[i+31:i] = dst[i+31:i] * 2**4
_MM_EXPADJ_5: dst[i+31:i] = dst[i+31:i] * 2**5
_MM_EXPADJ_8: dst[i+31:i] = dst[i+31:i] * 2**8
_MM_EXPADJ_16: dst[i+31:i] = dst[i+31:i] * 2**16
_MM_EXPADJ_24: dst[i+31:i] = dst[i+31:i] * 2**24
_MM_EXPADJ_31: dst[i+31:i] = dst[i+31:i] * 2**31
_MM_EXPADJ_32: dst[i+31:i] = dst[i+31:i] * 2**32
ESAC
ENDFOR
dst[MAX:512] := 0
vcvtfxpntudq2ps
__m512 _mm512_cvtfxpnt_round_adjustepu32_ps (__m512i v2, int rounding, _MM_EXP_ADJ_ENUM expadj)
Synopsis
__m512 _mm512_cvtfxpnt_round_adjustepu32_ps (__m512i v2, int rounding, _MM_EXP_ADJ_ENUM expadj)
#include "immintrin.h"
Instruction: vcvtfxpntudq2ps zmm {k}, zmm, imm
CPUID Flags: KNCNI
Description
Performs element-by-element conversion of packed 32-bit unsigned integer elements in
v2 to packed single-precision (32-bit) floating-point elements and performing an optional exponent adjust using
expadj, storing the results in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := UInt32ToFloat32(v2[i+31:i])
CASE expadj OF
_MM_EXPADJ_NONE: dst[i+31:i] = dst[i+31:i] * 2**0
_MM_EXPADJ_4: dst[i+31:i] = dst[i+31:i] * 2**4
_MM_EXPADJ_5: dst[i+31:i] = dst[i+31:i] * 2**5
_MM_EXPADJ_8: dst[i+31:i] = dst[i+31:i] * 2**8
_MM_EXPADJ_16: dst[i+31:i] = dst[i+31:i] * 2**16
_MM_EXPADJ_24: dst[i+31:i] = dst[i+31:i] * 2**24
_MM_EXPADJ_31: dst[i+31:i] = dst[i+31:i] * 2**31
_MM_EXPADJ_32: dst[i+31:i] = dst[i+31:i] * 2**32
ESAC
ENDFOR
dst[MAX:512] := 0
vcvtfxpntudq2ps
__m512 _mm512_mask_cvtfxpnt_round_adjustepu32_ps (__m512 src, __mmask16 k, __m512i v2, int rounding, _MM_EXP_ADJ_ENUM expadj)
Synopsis
__m512 _mm512_mask_cvtfxpnt_round_adjustepu32_ps (__m512 src, __mmask16 k, __m512i v2, int rounding, _MM_EXP_ADJ_ENUM expadj)
#include "immintrin.h"
Instruction: vcvtfxpntudq2ps zmm {k}, zmm, imm
CPUID Flags: KNCNI
Description
Performs element-by-element conversion of packed 32-bit unsigned integer elements in
v2 to packed single-precision (32-bit) floating-point elements and performing an optional exponent adjust using
expadj, storing the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := UInt32ToFloat32(v2[i+31:i])
CASE expadj OF
_MM_EXPADJ_NONE: dst[i+31:i] = dst[i+31:i] * 2**0
_MM_EXPADJ_4: dst[i+31:i] = dst[i+31:i] * 2**4
_MM_EXPADJ_5: dst[i+31:i] = dst[i+31:i] * 2**5
_MM_EXPADJ_8: dst[i+31:i] = dst[i+31:i] * 2**8
_MM_EXPADJ_16: dst[i+31:i] = dst[i+31:i] * 2**16
_MM_EXPADJ_24: dst[i+31:i] = dst[i+31:i] * 2**24
_MM_EXPADJ_31: dst[i+31:i] = dst[i+31:i] * 2**31
_MM_EXPADJ_32: dst[i+31:i] = dst[i+31:i] * 2**32
ESAC
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vcvtfxpntps2dq
__m512i _mm512_cvtfxpnt_round_adjustps_epi32 (__m512 v2, int rounding, _MM_EXP_ADJ_ENUM expadj)
Synopsis
__m512i _mm512_cvtfxpnt_round_adjustps_epi32 (__m512 v2, int rounding, _MM_EXP_ADJ_ENUM expadj)
#include "immintrin.h"
Instruction: vcvtfxpntps2dq zmm {k}, zmm, imm
CPUID Flags: KNCNI
Description
Performs element-by-element conversion of packed single-precision (32-bit) floating-point elements in
v2 to packed 32-bit integer elements and performs an optional exponent adjust using
expadj, storing the results in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := Float32ToInt32(v2[i+31:i])
CASE expadj OF
_MM_EXPADJ_NONE: dst[i+31:i] = dst[i+31:i] * 2**0
_MM_EXPADJ_4: dst[i+31:i] = dst[i+31:i] * 2**4
_MM_EXPADJ_5: dst[i+31:i] = dst[i+31:i] * 2**5
_MM_EXPADJ_8: dst[i+31:i] = dst[i+31:i] * 2**8
_MM_EXPADJ_16: dst[i+31:i] = dst[i+31:i] * 2**16
_MM_EXPADJ_24: dst[i+31:i] = dst[i+31:i] * 2**24
_MM_EXPADJ_31: dst[i+31:i] = dst[i+31:i] * 2**31
_MM_EXPADJ_32: dst[i+31:i] = dst[i+31:i] * 2**32
ESAC
ENDFOR
dst[MAX:512] := 0
vcvtfxpntps2udq
__m512i _mm512_cvtfxpnt_round_adjustps_epu32 (__m512 v2, int rounding, _MM_EXP_ADJ_ENUM expadj)
Synopsis
__m512i _mm512_cvtfxpnt_round_adjustps_epu32 (__m512 v2, int rounding, _MM_EXP_ADJ_ENUM expadj)
#include "immintrin.h"
Instruction: vcvtfxpntps2udq zmm {k}, zmm, imm
CPUID Flags: KNCNI
Description
Performs element-by-element conversion of packed single-precision (32-bit) floating-point elements in
v2 to packed 32-bit unsigned integer elements and performing an optional exponent adjust using
expadj, storing the results in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := Float32ToUInt32(v2[i+31:i])
CASE expadj OF
_MM_EXPADJ_NONE: dst[i+31:i] = dst[i+31:i] * 2**0
_MM_EXPADJ_4: dst[i+31:i] = dst[i+31:i] * 2**4
_MM_EXPADJ_5: dst[i+31:i] = dst[i+31:i] * 2**5
_MM_EXPADJ_8: dst[i+31:i] = dst[i+31:i] * 2**8
_MM_EXPADJ_16: dst[i+31:i] = dst[i+31:i] * 2**16
_MM_EXPADJ_24: dst[i+31:i] = dst[i+31:i] * 2**24
_MM_EXPADJ_31: dst[i+31:i] = dst[i+31:i] * 2**31
_MM_EXPADJ_32: dst[i+31:i] = dst[i+31:i] * 2**32
ESAC
ENDFOR
dst[MAX:512] := 0
vcvtfxpntpd2dq
__m512i _mm512_cvtfxpnt_roundpd_epi32lo (__m512d v2, int rounding)
Synopsis
__m512i _mm512_cvtfxpnt_roundpd_epi32lo (__m512d v2, int rounding)
#include "immintrin.h"
Instruction: vcvtfxpntpd2dq zmm {k}, m512, imm
CPUID Flags: KNCNI
Description
Performs an element-by-element conversion of elements in packed double-precision (64-bit) floating-point vector
v2 to 32-bit integer elements, storing them in the lower half of
dst. The elements in the upper half of
dst are set to 0.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
k := j*32
dst[k+31:k] := Float64ToInt32(v2[i+63:i])
ENDFOR
dst[MAX:512] := 0
vcvtfxpntpd2dq
__m512i _mm512_mask_cvtfxpnt_roundpd_epi32lo (__m512i src, __mmask8 k, __m512d v2, int rounding)
Synopsis
__m512i _mm512_mask_cvtfxpnt_roundpd_epi32lo (__m512i src, __mmask8 k, __m512d v2, int rounding)
#include "immintrin.h"
Instruction: vcvtfxpntpd2dq zmm {k}, m512, imm
CPUID Flags: KNCNI
Description
Performs an element-by-element conversion of elements in packed double-precision (64-bit) floating-point vector
v2 to 32-bit integer elements, storing them in the lower half of
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set). The elements in the upper half of
dst are set to 0.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
l := j*32
IF k[j]
dst[l+31:l] := Float64ToInt32(v2[i+63:i])
ELSE
dst[l+31:l] := src[l+31:l]
FI
ENDFOR
dst[MAX:512] := 0
vcvtfxpntpd2udq
__m512i _mm512_cvtfxpnt_roundpd_epu32lo (__m512d v2, int rounding)
Synopsis
__m512i _mm512_cvtfxpnt_roundpd_epu32lo (__m512d v2, int rounding)
#include "immintrin.h"
Instruction: vcvtfxpntpd2udq zmm {k}, zmm, imm
CPUID Flags: KNCNI
Description
Performs element-by-element conversion of packed double-precision (64-bit) floating-point elements in
v2 to packed 32-bit unsigned integer elements, storing the results in
dst. Results are written to the lower half of
dst, and the upper half locations are set to '0'.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
k := j*32
dst[k+31:k] := Float64ToUInt32(v2[i+63:i])
ENDFOR
dst[MAX:512] := 0
vcvtfxpntpd2udq
__m512i _mm512_mask_cvtfxpnt_roundpd_epu32lo (__m512i src, __mmask8 k, __m512d v2, int rounding)
Synopsis
__m512i _mm512_mask_cvtfxpnt_roundpd_epu32lo (__m512i src, __mmask8 k, __m512d v2, int rounding)
#include "immintrin.h"
Instruction: vcvtfxpntpd2udq zmm {k}, zmm, imm
CPUID Flags: KNCNI
Description
Performs element-by-element conversion of packed double-precision (64-bit) floating-point elements in
v2 to packed 32-bit unsigned integer elements, storing the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set). Results are written to the lower half of
dst, and the upper half locations are set to '0'.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
l := j*32
IF k[j]
dst[l+31:l] := Float64ToUInt32(v2[i+63:i])
ELSE
dst[l+31:l] := src[l+31:l]
FI
ENDFOR
dst[MAX:512] := 0
vcvtsi2sd
__m128d _mm_cvti32_sd (__m128d a, int b)
Synopsis
__m128d _mm_cvti32_sd (__m128d a, int b)
#include "immintrin.h"
Instruction: vcvtsi2sd xmm, xmm, r32
CPUID Flags: AVX512F
Description
Convert the 32-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
Operation
dst[63:0] := Convert_Int32_To_FP64(b[31:0])
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vcvtsi2ss
__m128 _mm_cvti32_ss (__m128 a, int b)
Synopsis
__m128 _mm_cvti32_ss (__m128 a, int b)
#include "immintrin.h"
Instruction: vcvtsi2ss xmm, xmm, r32
CPUID Flags: AVX512F
Description
Convert the 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
dst[31:0] := Convert_Int32_To_FP32(b[31:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vcvtsi2sd
__m128d _mm_cvti64_sd (__m128d a, __int64 b)
Synopsis
__m128d _mm_cvti64_sd (__m128d a, __int64 b)
#include "immintrin.h"
Instruction: vcvtsi2sd xmm, xmm, r64
CPUID Flags: AVX512F
Description
Convert the 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
Operation
dst[63:0] := Convert_Int64_To_FP64(b[63:0])
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vcvtsi2ss
__m128 _mm_cvti64_ss (__m128 a, __int64 b)
Synopsis
__m128 _mm_cvti64_ss (__m128 a, __int64 b)
#include "immintrin.h"
Instruction: vcvtsi2ss xmm, xmm, r64
CPUID Flags: AVX512F
Description
Convert the 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
dst[31:0] := Convert_Int64_To_FP32(b[63:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0
cvtpd2dq
__m128i _mm_cvtpd_epi32 (__m128d a)
Synopsis
__m128i _mm_cvtpd_epi32 (__m128d a)
#include "emmintrin.h"
Instruction: cvtpd2dq xmm, xmm
CPUID Flags: SSE2
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst.
Operation
FOR j := 0 to 1
i := 32*j
k := 64*j
dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
ENDFOR
Performance
vcvtpd2dq
__m128i _mm_mask_cvtpd_epi32 (__m128i src, __mmask8 k, __m128d a)
Synopsis
__m128i _mm_mask_cvtpd_epi32 (__m128i src, __mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vcvtpd2dq
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*32
l := j*64
IF k[j]
dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:64] := 0
vcvtpd2dq
__m128i _mm_maskz_cvtpd_epi32 (__mmask8 k, __m128d a)
Synopsis
__m128i _mm_maskz_cvtpd_epi32 (__mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vcvtpd2dq
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := 32*j
l := 64*j
IF k[j]
dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:64] := 0
vcvtpd2dq
__m128i _mm256_cvtpd_epi32 (__m256d a)
Synopsis
__m128i _mm256_cvtpd_epi32 (__m256d a)
#include "immintrin.h"
Instruction: vcvtpd2dq xmm, ymm
CPUID Flags: AVX
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst.
Operation
FOR j := 0 to 3
i := 32*j
k := 64*j
dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
ENDFOR
dst[MAX:128] := 0
Performance
vcvtpd2dq
__m128i _mm256_mask_cvtpd_epi32 (__m128i src, __mmask8 k, __m256d a)
Synopsis
__m128i _mm256_mask_cvtpd_epi32 (__m128i src, __mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vcvtpd2dq
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
l := j*64
IF k[j]
dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vcvtpd2dq
__m128i _mm256_maskz_cvtpd_epi32 (__mmask8 k, __m256d a)
Synopsis
__m128i _mm256_maskz_cvtpd_epi32 (__mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vcvtpd2dq
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 32*j
l := 64*j
IF k[j]
dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vcvtpd2dq
__m256i _mm512_cvtpd_epi32 (__m512d a)
Synopsis
__m256i _mm512_cvtpd_epi32 (__m512d a)
#include "immintrin.h"
Instruction: vcvtpd2dq ymm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst.
Operation
FOR j := 0 to 7
i := 32*j
k := 64*j
dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
ENDFOR
dst[MAX:256] := 0
vcvtpd2dq
__m256i _mm512_mask_cvtpd_epi32 (__m256i src, __mmask8 k, __m512d a)
Synopsis
__m256i _mm512_mask_cvtpd_epi32 (__m256i src, __mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vcvtpd2dq ymm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
l := j*64
IF k[j]
dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vcvtpd2dq
__m256i _mm512_maskz_cvtpd_epi32 (__mmask8 k, __m512d a)
Synopsis
__m256i _mm512_maskz_cvtpd_epi32 (__mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vcvtpd2dq ymm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 32*j
l := 64*j
IF k[j]
dst[i+31:i] := Convert_FP64_To_Int32(a[l+63:l])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vcvtpd2qq
__m128i _mm_cvtpd_epi64 (__m128d a)
Synopsis
__m128i _mm_cvtpd_epi64 (__m128d a)
#include "immintrin.h"
Instruction: vcvtpd2qq
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed 64-bit integers, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
vcvtpd2qq
__m128i _mm_mask_cvtpd_epi64 (__m128i src, __mmask8 k, __m128d a)
Synopsis
__m128i _mm_mask_cvtpd_epi64 (__m128i src, __mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vcvtpd2qq
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vcvtpd2qq
__m128i _mm_maskz_cvtpd_epi64 (__mmask8 k, __m128d a)
Synopsis
__m128i _mm_maskz_cvtpd_epi64 (__mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vcvtpd2qq
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vcvtpd2qq
__m256i _mm256_cvtpd_epi64 (__m256d a)
Synopsis
__m256i _mm256_cvtpd_epi64 (__m256d a)
#include "immintrin.h"
Instruction: vcvtpd2qq
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed 64-bit integers, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
vcvtpd2qq
__m256i _mm256_mask_cvtpd_epi64 (__m256i src, __mmask8 k, __m256d a)
Synopsis
__m256i _mm256_mask_cvtpd_epi64 (__m256i src, __mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vcvtpd2qq
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vcvtpd2qq
__m256i _mm256_maskz_cvtpd_epi64 (__mmask8 k, __m256d a)
Synopsis
__m256i _mm256_maskz_cvtpd_epi64 (__mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vcvtpd2qq
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vcvtpd2qq
__m512i _mm512_cvtpd_epi64 (__m512d a)
Synopsis
__m512i _mm512_cvtpd_epi64 (__m512d a)
#include "immintrin.h"
Instruction: vcvtpd2qq
CPUID Flags: AVX512DQ
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed 64-bit integers, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
vcvtpd2qq
__m512i _mm512_mask_cvtpd_epi64 (__m512i src, __mmask8 k, __m512d a)
Synopsis
__m512i _mm512_mask_cvtpd_epi64 (__m512i src, __mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vcvtpd2qq
CPUID Flags: AVX512DQ
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vcvtpd2qq
__m512i _mm512_maskz_cvtpd_epi64 (__mmask8 k, __m512d a)
Synopsis
__m512i _mm512_maskz_cvtpd_epi64 (__mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vcvtpd2qq
CPUID Flags: AVX512DQ
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := Convert_FP64_To_Int64(a[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vcvtpd2udq
__m128i _mm_cvtpd_epu32 (__m128d a)
Synopsis
__m128i _mm_cvtpd_epu32 (__m128d a)
#include "immintrin.h"
Instruction: vcvtpd2udq
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst.
Operation
FOR j := 0 to 1
i := 32*j
k := 64*j
dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[k+63:k])
ENDFOR
dst[MAX:64] := 0
vcvtpd2udq
__m128i _mm_mask_cvtpd_epu32 (__m128i src, __mmask8 k, __m128d a)
Synopsis
__m128i _mm_mask_cvtpd_epu32 (__m128i src, __mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vcvtpd2udq
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*32
l := j*64
IF k[j]
dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[l+63:l])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:64] := 0
vcvtpd2udq
__m128i _mm_maskz_cvtpd_epu32 (__mmask8 k, __m128d a)
Synopsis
__m128i _mm_maskz_cvtpd_epu32 (__mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vcvtpd2udq
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := 32*j
l := 64*j
IF k[j]
dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[l+63:l])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:64] := 0
vcvtpd2udq
__m128i _mm256_cvtpd_epu32 (__m256d a)
Synopsis
__m128i _mm256_cvtpd_epu32 (__m256d a)
#include "immintrin.h"
Instruction: vcvtpd2udq
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst.
Operation
FOR j := 0 to 3
i := 32*j
k := 64*j
dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[k+63:k])
ENDFOR
dst[MAX:128] := 0
vcvtpd2udq
__m128i _mm256_mask_cvtpd_epu32 (__m128i src, __mmask8 k, __m256d a)
Synopsis
__m128i _mm256_mask_cvtpd_epu32 (__m128i src, __mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vcvtpd2udq
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
l := j*64
IF k[j]
dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[l+63:l])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vcvtpd2udq
__m128i _mm256_maskz_cvtpd_epu32 (__mmask8 k, __m256d a)
Synopsis
__m128i _mm256_maskz_cvtpd_epu32 (__mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vcvtpd2udq
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 32*j
l := 64*j
IF k[j]
dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[l+63:l])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vcvtpd2udq
__m256i _mm512_cvtpd_epu32 (__m512d a)
Synopsis
__m256i _mm512_cvtpd_epu32 (__m512d a)
#include "immintrin.h"
Instruction: vcvtpd2udq ymm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst.
Operation
FOR j := 0 to 7
i := 32*j
k := 64*j
dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[k+63:k])
ENDFOR
dst[MAX:256] := 0
vcvtpd2udq
__m256i _mm512_mask_cvtpd_epu32 (__m256i src, __mmask8 k, __m512d a)
Synopsis
__m256i _mm512_mask_cvtpd_epu32 (__m256i src, __mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vcvtpd2udq ymm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
l := j*64
IF k[j]
dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[l+63:l])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vcvtpd2udq
__m256i _mm512_maskz_cvtpd_epu32 (__mmask8 k, __m512d a)
Synopsis
__m256i _mm512_maskz_cvtpd_epu32 (__mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vcvtpd2udq ymm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 32*j
l := 64*j
IF k[j]
dst[i+31:i] := Convert_FP64_To_UnsignedInt32(a[l+63:l])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vcvtpd2uqq
__m128i _mm_cvtpd_epu64 (__m128d a)
Synopsis
__m128i _mm_cvtpd_epu64 (__m128d a)
#include "immintrin.h"
Instruction: vcvtpd2uqq
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
vcvtpd2uqq
__m128i _mm_mask_cvtpd_epu64 (__m128i src, __mmask8 k, __m128d a)
Synopsis
__m128i _mm_mask_cvtpd_epu64 (__m128i src, __mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vcvtpd2uqq
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vcvtpd2uqq
__m128i _mm_maskz_cvtpd_epu64 (__mmask8 k, __m128d a)
Synopsis
__m128i _mm_maskz_cvtpd_epu64 (__mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vcvtpd2uqq
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vcvtpd2uqq
__m256i _mm256_cvtpd_epu64 (__m256d a)
Synopsis
__m256i _mm256_cvtpd_epu64 (__m256d a)
#include "immintrin.h"
Instruction: vcvtpd2uqq
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
vcvtpd2uqq
__m256i _mm256_mask_cvtpd_epu64 (__m256i src, __mmask8 k, __m256d a)
Synopsis
__m256i _mm256_mask_cvtpd_epu64 (__m256i src, __mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vcvtpd2uqq
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vcvtpd2uqq
__m256i _mm256_maskz_cvtpd_epu64 (__mmask8 k, __m256d a)
Synopsis
__m256i _mm256_maskz_cvtpd_epu64 (__mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vcvtpd2uqq
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vcvtpd2uqq
__m512i _mm512_cvtpd_epu64 (__m512d a)
Synopsis
__m512i _mm512_cvtpd_epu64 (__m512d a)
#include "immintrin.h"
Instruction: vcvtpd2uqq
CPUID Flags: AVX512DQ
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
vcvtpd2uqq
__m512i _mm512_mask_cvtpd_epu64 (__m512i src, __mmask8 k, __m512d a)
Synopsis
__m512i _mm512_mask_cvtpd_epu64 (__m512i src, __mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vcvtpd2uqq
CPUID Flags: AVX512DQ
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vcvtpd2uqq
__m512i _mm512_maskz_cvtpd_epu64 (__mmask8 k, __m512d a)
Synopsis
__m512i _mm512_maskz_cvtpd_epu64 (__mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vcvtpd2uqq
CPUID Flags: AVX512DQ
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := Convert_FP64_To_UnsignedInt64(a[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
cvtpd2pi
__m64 _mm_cvtpd_pi32 (__m128d a)
Synopsis
__m64 _mm_cvtpd_pi32 (__m128d a)
#include "emmintrin.h"
Instruction: cvtpd2pi mm, xmm
CPUID Flags: SSE2
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst.
Operation
FOR j := 0 to 1
i := 32*j
k := 64*j
dst[i+31:i] := Convert_FP64_To_Int32(a[k+63:k])
ENDFOR
cvtpd2ps
__m128 _mm_cvtpd_ps (__m128d a)
Synopsis
__m128 _mm_cvtpd_ps (__m128d a)
#include "emmintrin.h"
Instruction: cvtpd2ps xmm, xmm
CPUID Flags: SSE2
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.
Operation
FOR j := 0 to 1
i := 32*j
k := 64*j
dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k])
ENDFOR
Performance
vcvtpd2ps
__m128 _mm_mask_cvtpd_ps (__m128 src, __mmask8 k, __m128d a)
Synopsis
__m128 _mm_mask_cvtpd_ps (__m128 src, __mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vcvtpd2ps
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := 32*j
l := 64*j
IF k[j]
dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:64] := 0
vcvtpd2ps
__m128 _mm_maskz_cvtpd_ps (__mmask8 k, __m128d a)
Synopsis
__m128 _mm_maskz_cvtpd_ps (__mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vcvtpd2ps
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*32
l := j*64
IF k[j]
dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:64] := 0
vcvtpd2ps
__m128 _mm256_cvtpd_ps (__m256d a)
Synopsis
__m128 _mm256_cvtpd_ps (__m256d a)
#include "immintrin.h"
Instruction: vcvtpd2ps xmm, ymm
CPUID Flags: AVX
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.
Operation
FOR j := 0 to 3
i := 32*j
k := 64*j
dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k])
ENDFOR
dst[MAX:128] := 0
Performance
vcvtpd2ps
__m128 _mm256_mask_cvtpd_ps (__m128 src, __mmask8 k, __m256d a)
Synopsis
__m128 _mm256_mask_cvtpd_ps (__m128 src, __mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vcvtpd2ps
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 32*j
l := 64*j
IF k[j]
dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vcvtpd2ps
__m128 _mm256_maskz_cvtpd_ps (__mmask8 k, __m256d a)
Synopsis
__m128 _mm256_maskz_cvtpd_ps (__mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vcvtpd2ps
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
l := j*64
IF k[j]
dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vcvtpd2ps
__m256 _mm512_cvtpd_ps (__m512d a)
Synopsis
__m256 _mm512_cvtpd_ps (__m512d a)
#include "immintrin.h"
Instruction: vcvtpd2ps ymm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.
Operation
FOR j := 0 to 7
i := 32*j
k := 64*j
dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k])
ENDFOR
dst[MAX:256] := 0
vcvtpd2ps
__m256 _mm512_mask_cvtpd_ps (__m256 src, __mmask8 k, __m512d a)
Synopsis
__m256 _mm512_mask_cvtpd_ps (__m256 src, __mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vcvtpd2ps ymm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 32*j
l := 64*j
IF k[j]
dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vcvtpd2ps
__m256 _mm512_maskz_cvtpd_ps (__mmask8 k, __m512d a)
Synopsis
__m256 _mm512_maskz_cvtpd_ps (__mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vcvtpd2ps ymm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
l := j*64
IF k[j]
dst[i+31:i] := Convert_FP64_To_FP32(a[l+63:l])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vcvtpd2ps
__m512 _mm512_cvtpd_pslo (__m512d v2)
Synopsis
__m512 _mm512_cvtpd_pslo (__m512d v2)
#include "immintrin.h"
Instruction: vcvtpd2ps zmm {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Performs an element-by-element conversion of packed double-precision (64-bit) floating-point elements in v2 to single-precision (32-bit) floating-point elements and stores them in dst. The elements are stored in the lower half of the results vector, while the remaining upper half locations are set to 0.
Operation
FOR j := 0 to 7
i := j*64
k := j*32
dst[k+31:k] := Float64ToFloat32(v2[i+63:i])
ENDFOR
dst[MAX:256] := 0
vcvtpd2ps
__m512 _mm512_mask_cvtpd_pslo (__m512 src, __mmask8 k, __m512d v2)
Synopsis
__m512 _mm512_mask_cvtpd_pslo (__m512 src, __mmask8 k, __m512d v2)
#include "immintrin.h"
Instruction: vcvtpd2ps zmm {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Performs an element-by-element conversion of packed double-precision (64-bit) floating-point elements in v2 to single-precision (32-bit) floating-point elements and stores them in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The elements are stored in the lower half of the results vector, while the remaining upper half locations are set to 0.
Operation
FOR j := 0 to 7
i := j*64
l := j*32
IF k[j]
dst[l+31:l] := Float64ToFloat32(v2[i+63:i])
ELSE
dst[l+31:l] := src[l+31:l]
FI
ENDFOR
dst[MAX:256] := 0
vcvtph2ps
__m128 _mm_cvtph_ps (__m128i a)
Synopsis
__m128 _mm_cvtph_ps (__m128i a)
#include "immintrin.h"
Instruction: vcvtph2ps xmm, xmm
CPUID Flags: FP16C
Description
Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
m := j*16
dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
ENDFOR
dst[MAX:128] := 0
Performance
vcvtph2ps
__m128 _mm_mask_cvtph_ps (__m128 src, __mmask8 k, __m128i a)
Synopsis
__m128 _mm_mask_cvtph_ps (__m128 src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vcvtph2ps
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
m := j*16
IF k[j]
dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vcvtph2ps
__m128 _mm_maskz_cvtph_ps (__mmask8 k, __m128i a)
Synopsis
__m128 _mm_maskz_cvtph_ps (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vcvtph2ps
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
m := j*16
IF k[j]
dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vcvtph2ps
__m256 _mm256_cvtph_ps (__m128i a)
Synopsis
__m256 _mm256_cvtph_ps (__m128i a)
#include "immintrin.h"
Instruction: vcvtph2ps ymm, xmm
CPUID Flags: FP16C
Description
Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
m := j*16
dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
ENDFOR
dst[MAX:256] := 0
Performance
vcvtph2ps
__m256 _mm256_mask_cvtph_ps (__m256 src, __mmask8 k, __m128i a)
Synopsis
__m256 _mm256_mask_cvtph_ps (__m256 src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vcvtph2ps
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
m := j*16
IF k[j]
dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vcvtph2ps
__m256 _mm256_maskz_cvtph_ps (__mmask8 k, __m128i a)
Synopsis
__m256 _mm256_maskz_cvtph_ps (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vcvtph2ps
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
m := j*16
IF k[j]
dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vcvtph2ps
__m512 _mm512_cvtph_ps (__m256i a)
Synopsis
__m512 _mm512_cvtph_ps (__m256i a)
#include "immintrin.h"
Instruction: vcvtph2ps zmm {k}, ymm
CPUID Flags: AVX512F
Description
Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
m := j*16
dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
ENDFOR
dst[MAX:512] := 0
vcvtph2ps
__m512 _mm512_mask_cvtph_ps (__m512 src, __mmask16 k, __m256i a)
Synopsis
__m512 _mm512_mask_cvtph_ps (__m512 src, __mmask16 k, __m256i a)
#include "immintrin.h"
Instruction: vcvtph2ps zmm {k}, ymm
CPUID Flags: AVX512F
Description
Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
m := j*16
IF k[j]
dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vcvtph2ps
__m512 _mm512_maskz_cvtph_ps (__mmask16 k, __m256i a)
Synopsis
__m512 _mm512_maskz_cvtph_ps (__mmask16 k, __m256i a)
#include "immintrin.h"
Instruction: vcvtph2ps zmm {k}, ymm
CPUID Flags: AVX512F
Description
Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
m := j*16
IF k[j]
dst[i+31:i] := Convert_FP16_To_FP32(a[m+15:m])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
...
__m128 _mm_cvtpi16_ps (__m64 a)
Synopsis
__m128 _mm_cvtpi16_ps (__m64 a)
#include "xmmintrin.h"
CPUID Flags: SSE
Description
Convert packed 16-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*16
m := j*32
dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i])
ENDFOR
cvtpi2pd
__m128d _mm_cvtpi32_pd (__m64 a)
Synopsis
__m128d _mm_cvtpi32_pd (__m64 a)
#include "emmintrin.h"
Instruction: cvtpi2pd xmm, mm
CPUID Flags: SSE2
Description
Convert packed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*32
m := j*64
dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
ENDFOR
cvtpi2ps
__m128 _mm_cvtpi32_ps (__m128 a, __m64 b)
Synopsis
__m128 _mm_cvtpi32_ps (__m128 a, __m64 b)
#include "xmmintrin.h"
Instruction: cvtpi2ps xmm, mm
CPUID Flags: SSE
Description
Convert packed 32-bit integers in b to packed single-precision (32-bit) floating-point elements, store the results in the lower 2 elements of dst, and copy the upper 2 packed elements from a to the upper elements of dst.
Operation
dst[31:0] := Convert_Int32_To_FP32(b[31:0])
dst[63:32] := Convert_Int32_To_FP32(b[63:32])
dst[95:64] := a[95:64]
dst[127:96] := a[127:96]
Performance
...
__m128 _mm_cvtpi32x2_ps (__m64 a, __m64 b)
Synopsis
__m128 _mm_cvtpi32x2_ps (__m64 a, __m64 b)
#include "xmmintrin.h"
CPUID Flags: SSE
Description
Convert packed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, store the results in the lower 2 elements of dst, then convert the packed 32-bit integers in b to single-precision (32-bit) floating-point elements, and store the results in the upper 2 elements of dst.
Operation
dst[31:0] := Convert_Int32_To_FP32(a[31:0])
dst[63:32] := Convert_Int32_To_FP32(a[63:32])
dst[95:64] := Convert_Int32_To_FP32(b[31:0])
dst[127:96] := Convert_Int32_To_FP32(b[63:32])
...
__m128 _mm_cvtpi8_ps (__m64 a)
Synopsis
__m128 _mm_cvtpi8_ps (__m64 a)
#include "xmmintrin.h"
CPUID Flags: SSE
Description
Convert the lower packed 8-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*8
m := j*32
dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i])
ENDFOR
cvtps2dq
__m128i _mm_cvtps_epi32 (__m128 a)
Synopsis
__m128i _mm_cvtps_epi32 (__m128 a)
#include "emmintrin.h"
Instruction: cvtps2dq xmm, xmm
CPUID Flags: SSE2
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst.
Operation
FOR j := 0 to 3
i := 32*j
dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
ENDFOR
Performance
vcvtps2dq
__m128i _mm_mask_cvtps_epi32 (__m128i src, __mmask8 k, __m128 a)
Synopsis
__m128i _mm_mask_cvtps_epi32 (__m128i src, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcvtps2dq
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vcvtps2dq
__m128i _mm_maskz_cvtps_epi32 (__mmask8 k, __m128 a)
Synopsis
__m128i _mm_maskz_cvtps_epi32 (__mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcvtps2dq
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 32*j
IF k[j]
dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vcvtps2dq
__m256i _mm256_cvtps_epi32 (__m256 a)
Synopsis
__m256i _mm256_cvtps_epi32 (__m256 a)
#include "immintrin.h"
Instruction: vcvtps2dq ymm, ymm
CPUID Flags: AVX
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst.
Operation
FOR j := 0 to 7
i := 32*j
dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
Performance
vcvtps2dq
__m256i _mm256_mask_cvtps_epi32 (__m256i src, __mmask8 k, __m256 a)
Synopsis
__m256i _mm256_mask_cvtps_epi32 (__m256i src, __mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vcvtps2dq
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vcvtps2dq
__m256i _mm256_maskz_cvtps_epi32 (__mmask8 k, __m256 a)
Synopsis
__m256i _mm256_maskz_cvtps_epi32 (__mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vcvtps2dq
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 32*j
IF k[j]
dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vcvtps2dq
__m512i _mm512_cvtps_epi32 (__m512 a)
Synopsis
__m512i _mm512_cvtps_epi32 (__m512 a)
#include "immintrin.h"
Instruction: vcvtps2dq zmm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst.
Operation
FOR j := 0 to 15
i := 32*j
dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
vcvtps2dq
__m512i _mm512_mask_cvtps_epi32 (__m512i src, __mmask16 k, __m512 a)
Synopsis
__m512i _mm512_mask_cvtps_epi32 (__m512i src, __mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vcvtps2dq zmm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vcvtps2dq
__m512i _mm512_maskz_cvtps_epi32 (__mmask16 k, __m512 a)
Synopsis
__m512i _mm512_maskz_cvtps_epi32 (__mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vcvtps2dq zmm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := 32*j
IF k[j]
dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vcvtps2qq
__m128i _mm_cvtps_epi64 (__m128 a)
Synopsis
__m128i _mm_cvtps_epi64 (__m128 a)
#include "immintrin.h"
Instruction: vcvtps2qq
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed 64-bit integers, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
l := j*32
dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
ENDFOR
dst[MAX:128] := 0
vcvtps2qq
__m128i _mm_mask_cvtps_epi64 (__m128i src, __mmask8 k, __m128 a)
Synopsis
__m128i _mm_mask_cvtps_epi64 (__m128i src, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcvtps2qq
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
l := j*32
IF k[j]
dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vcvtps2qq
__m128i _mm_maskz_cvtps_epi64 (__mmask8 k, __m128 a)
Synopsis
__m128i _mm_maskz_cvtps_epi64 (__mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcvtps2qq
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
l := j*32
IF k[j]
dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vcvtps2qq
__m256i _mm256_cvtps_epi64 (__m128 a)
Synopsis
__m256i _mm256_cvtps_epi64 (__m128 a)
#include "immintrin.h"
Instruction: vcvtps2qq
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed 64-bit integers, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
l := j*32
dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
ENDFOR
dst[MAX:256] := 0
vcvtps2qq
__m256i _mm256_mask_cvtps_epi64 (__m256i src, __mmask8 k, __m128 a)
Synopsis
__m256i _mm256_mask_cvtps_epi64 (__m256i src, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcvtps2qq
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
l := j*32
IF k[j]
dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vcvtps2qq
__m256i _mm256_maskz_cvtps_epi64 (__mmask8 k, __m128 a)
Synopsis
__m256i _mm256_maskz_cvtps_epi64 (__mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcvtps2qq
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
l := j*32
IF k[j]
dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vcvtps2qq
__m512i _mm512_cvtps_epi64 (__m256 a)
Synopsis
__m512i _mm512_cvtps_epi64 (__m256 a)
#include "immintrin.h"
Instruction: vcvtps2qq
CPUID Flags: AVX512DQ
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed 64-bit integers, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
l := j*32
dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
ENDFOR
dst[MAX:512] := 0
vcvtps2qq
__m512i _mm512_mask_cvtps_epi64 (__m512i src, __mmask8 k, __m256 a)
Synopsis
__m512i _mm512_mask_cvtps_epi64 (__m512i src, __mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vcvtps2qq
CPUID Flags: AVX512DQ
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
l := j*32
IF k[j]
dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vcvtps2qq
__m512i _mm512_maskz_cvtps_epi64 (__mmask8 k, __m256 a)
Synopsis
__m512i _mm512_maskz_cvtps_epi64 (__mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vcvtps2qq
CPUID Flags: AVX512DQ
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
l := j*32
IF k[j]
dst[i+63:i] := Convert_FP32_To_Int64(a[l+31:l])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vcvtps2udq
__m128i _mm_cvtps_epu32 (__m128 a)
Synopsis
__m128i _mm_cvtps_epu32 (__m128 a)
#include "immintrin.h"
Instruction: vcvtps2udq
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst.
Operation
FOR j := 0 to 3
i := 32*j
dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
vcvtps2udq
__m128i _mm_mask_cvtps_epu32 (__m128i src, __mmask8 k, __m128 a)
Synopsis
__m128i _mm_mask_cvtps_epu32 (__m128i src, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcvtps2udq
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 32*j
IF k[j]
dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vcvtps2udq
__m128i _mm_maskz_cvtps_epu32 (__mmask8 k, __m128 a)
Synopsis
__m128i _mm_maskz_cvtps_epu32 (__mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcvtps2udq
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 32*j
IF k[j]
dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vcvtps2udq
__m256i _mm256_cvtps_epu32 (__m256 a)
Synopsis
__m256i _mm256_cvtps_epu32 (__m256 a)
#include "immintrin.h"
Instruction: vcvtps2udq
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst.
Operation
FOR j := 0 to 7
i := 32*j
dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
vcvtps2udq
__m256i _mm256_mask_cvtps_epu32 (__m256i src, __mmask8 k, __m256 a)
Synopsis
__m256i _mm256_mask_cvtps_epu32 (__m256i src, __mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vcvtps2udq
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 32*j
IF k[j]
dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vcvtps2udq
__m256i _mm256_maskz_cvtps_epu32 (__mmask8 k, __m256 a)
Synopsis
__m256i _mm256_maskz_cvtps_epu32 (__mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vcvtps2udq
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 32*j
IF k[j]
dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vcvtps2udq
__m512i _mm512_cvtps_epu32 (__m512 a)
Synopsis
__m512i _mm512_cvtps_epu32 (__m512 a)
#include "immintrin.h"
Instruction: vcvtps2udq zmm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst.
Operation
FOR j := 0 to 15
i := 32*j
dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
vcvtps2udq
__m512i _mm512_mask_cvtps_epu32 (__m512i src, __mmask16 k, __m512 a)
Synopsis
__m512i _mm512_mask_cvtps_epu32 (__m512i src, __mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vcvtps2udq zmm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := 32*j
IF k[j]
dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vcvtps2udq
__m512i _mm512_maskz_cvtps_epu32 (__mmask16 k, __m512 a)
Synopsis
__m512i _mm512_maskz_cvtps_epu32 (__mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vcvtps2udq zmm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := 32*j
IF k[j]
dst[i+31:i] := Convert_FP32_To_UnsignedInt32(a[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vcvtps2uqq
__m128i _mm_cvtps_epu64 (__m128 a)
Synopsis
__m128i _mm_cvtps_epu64 (__m128 a)
#include "immintrin.h"
Instruction: vcvtps2uqq
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
l := j*32
dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l])
ENDFOR
dst[MAX:128] := 0
vcvtps2uqq
__m128i _mm_mask_cvtps_epu64 (__m128i src, __mmask8 k, __m128 a)
Synopsis
__m128i _mm_mask_cvtps_epu64 (__m128i src, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcvtps2uqq
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
l := j*32
IF k[j]
dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vcvtps2uqq
__m128i _mm_maskz_cvtps_epu64 (__mmask8 k, __m128 a)
Synopsis
__m128i _mm_maskz_cvtps_epu64 (__mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcvtps2uqq
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
l := j*32
IF k[j]
dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vcvtps2uqq
__m256i _mm256_cvtps_epu64 (__m128 a)
Synopsis
__m256i _mm256_cvtps_epu64 (__m128 a)
#include "immintrin.h"
Instruction: vcvtps2uqq
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
l := j*32
dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l])
ENDFOR
dst[MAX:256] := 0
vcvtps2uqq
__m256i _mm256_mask_cvtps_epu64 (__m256i src, __mmask8 k, __m128 a)
Synopsis
__m256i _mm256_mask_cvtps_epu64 (__m256i src, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcvtps2uqq
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
l := j*32
IF k[j]
dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vcvtps2uqq
__m256i _mm256_maskz_cvtps_epu64 (__mmask8 k, __m128 a)
Synopsis
__m256i _mm256_maskz_cvtps_epu64 (__mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcvtps2uqq
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
l := j*32
IF k[j]
dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vcvtps2uqq
__m512i _mm512_cvtps_epu64 (__m256 a)
Synopsis
__m512i _mm512_cvtps_epu64 (__m256 a)
#include "immintrin.h"
Instruction: vcvtps2uqq
CPUID Flags: AVX512DQ
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
l := j*32
dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l])
ENDFOR
dst[MAX:512] := 0
vcvtps2uqq
__m512i _mm512_mask_cvtps_epu64 (__m512i src, __mmask8 k, __m256 a)
Synopsis
__m512i _mm512_mask_cvtps_epu64 (__m512i src, __mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vcvtps2uqq
CPUID Flags: AVX512DQ
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
l := j*32
IF k[j]
dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vcvtps2uqq
__m512i _mm512_maskz_cvtps_epu64 (__mmask8 k, __m256 a)
Synopsis
__m512i _mm512_maskz_cvtps_epu64 (__mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vcvtps2uqq
CPUID Flags: AVX512DQ
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
l := j*32
IF k[j]
dst[i+63:i] := Convert_FP32_To_UnsignedInt64(a[l+31:l])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
cvtps2pd
__m128d _mm_cvtps_pd (__m128 a)
Synopsis
__m128d _mm_cvtps_pd (__m128 a)
#include "emmintrin.h"
Instruction: cvtps2pd xmm, xmm
CPUID Flags: SSE2
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.
Operation
FOR j := 0 to 1
i := 64*j
k := 32*j
dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
ENDFOR
Performance
vcvtps2pd
__m256d _mm256_cvtps_pd (__m128 a)
Synopsis
__m256d _mm256_cvtps_pd (__m128 a)
#include "immintrin.h"
Instruction: vcvtps2pd ymm, xmm
CPUID Flags: AVX
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.
Operation
FOR j := 0 to 3
i := 64*j
k := 32*j
dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
ENDFOR
dst[MAX:256] := 0
Performance
vcvtps2pd
__m512d _mm512_cvtps_pd (__m256 a)
Synopsis
__m512d _mm512_cvtps_pd (__m256 a)
#include "immintrin.h"
Instruction: vcvtps2pd zmm {k}, ymm
CPUID Flags: AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst.
Operation
FOR j := 0 to 7
i := 64*j
k := 32*j
dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
ENDFOR
dst[MAX:512] := 0
vcvtps2pd
__m512d _mm512_mask_cvtps_pd (__m512d src, __mmask8 k, __m256 a)
Synopsis
__m512d _mm512_mask_cvtps_pd (__m512d src, __mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vcvtps2pd zmm {k}, ymm
CPUID Flags: AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 64*j
l := 32*j
IF k[j]
dst[i+63:i] := Convert_FP32_To_FP64(a[l+31:l])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vcvtps2pd
__m512d _mm512_maskz_cvtps_pd (__mmask8 k, __m256 a)
Synopsis
__m512d _mm512_maskz_cvtps_pd (__mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vcvtps2pd zmm {k}, ymm
CPUID Flags: AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 64*j
l := 32*j
IF k[j]
dst[i+63:i] := Convert_FP32_To_FP64(a[l+31:l])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vcvtps2ph
__m128i _mm_cvtps_ph (__m128 a, int rounding)
Synopsis
__m128i _mm_cvtps_ph (__m128 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2ph xmm, xmm, imm
CPUID Flags: FP16C
Description
Convert packed single-precision (32-bit) floating-point elements in
a to packed half-precision (16-bit) floating-point elements, and store the results in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 3
i := 16*j
l := 32*j
dst[i+15:i] := Convert_FP32_To_FP16FP(a[l+31:l])
ENDFOR
dst[MAX:128] := 0
Performance
vcvtps2ph
__m128i _mm_mask_cvtps_ph (__m128i src, __mmask8 k, __m128 a, int rounding)
Synopsis
__m128i _mm_mask_cvtps_ph (__m128i src, __mmask8 k, __m128 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2ph
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in
a to packed half-precision (16-bit) floating-point elements, and store the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 3
i := 16*j
l := 32*j
IF k[j]
dst[i+15:i] := Convert_FP32_To_FP16FP(a[l+31:l])
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:64] := 0
vcvtps2ph
__m128i _mm_maskz_cvtps_ph (__mmask8 k, __m128 a, int rounding)
Synopsis
__m128i _mm_maskz_cvtps_ph (__mmask8 k, __m128 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2ph
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in
a to packed half-precision (16-bit) floating-point elements, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 3
i := 16*j
l := 32*j
IF k[j]
dst[i+15:i] := Convert_FP32_To_FP16FP(a[l+31:l])
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:64] := 0
vcvtps2ph
__m128i _mm256_cvtps_ph (__m256 a, int rounding)
Synopsis
__m128i _mm256_cvtps_ph (__m256 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2ph xmm, ymm, imm
CPUID Flags: FP16C
Description
Convert packed single-precision (32-bit) floating-point elements in
a to packed half-precision (16-bit) floating-point elements, and store the results in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := 16*j
l := 32*j
dst[i+15:i] := Convert_FP32_To_FP16FP(a[l+31:l])
ENDFOR
dst[MAX:128] := 0
Performance
vcvtps2ph
__m128i _mm256_mask_cvtps_ph (__m128i src, __mmask8 k, __m256 a, int rounding)
Synopsis
__m128i _mm256_mask_cvtps_ph (__m128i src, __mmask8 k, __m256 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2ph
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in
a to packed half-precision (16-bit) floating-point elements, and store the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := 16*j
l := 32*j
IF k[j]
dst[i+15:i] := Convert_FP32_To_FP16FP(a[l+31:l])
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:128] := 0
vcvtps2ph
__m128i _mm256_maskz_cvtps_ph (__mmask8 k, __m256 a, int rounding)
Synopsis
__m128i _mm256_maskz_cvtps_ph (__mmask8 k, __m256 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2ph
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in
a to packed half-precision (16-bit) floating-point elements, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := 16*j
l := 32*j
IF k[j]
dst[i+15:i] := Convert_FP32_To_FP16FP(a[l+31:l])
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vcvtps2ph
__m256i _mm512_cvtps_ph (__m512 a, int rounding)
Synopsis
__m256i _mm512_cvtps_ph (__m512 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2ph ymm {k}, zmm {sae}, imm
CPUID Flags: AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in
a to packed half-precision (16-bit) floating-point elements, and store the results in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := 16*j
l := 32*j
dst[i+15:i] := Convert_FP32_To_FP16FP(a[l+31:l])
ENDFOR
dst[MAX:256] := 0
vcvtps2ph
__m256i _mm512_mask_cvtps_ph (__m256i src, __mmask16 k, __m512 a, int rounding)
Synopsis
__m256i _mm512_mask_cvtps_ph (__m256i src, __mmask16 k, __m512 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2ph ymm {k}, zmm {sae}, imm
CPUID Flags: AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in
a to packed half-precision (16-bit) floating-point elements, and store the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := 16*j
l := 32*j
IF k[j]
dst[i+15:i] := Convert_FP32_To_FP16FP(a[l+31:l])
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
vcvtps2ph
__m256i _mm512_maskz_cvtps_ph (__mmask16 k, __m512 a, int rounding)
Synopsis
__m256i _mm512_maskz_cvtps_ph (__mmask16 k, __m512 a, int rounding)
#include "immintrin.h"
Instruction: vcvtps2ph ymm {k}, zmm {sae}, imm
CPUID Flags: AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in
a to packed half-precision (16-bit) floating-point elements, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := 16*j
l := 32*j
IF k[j]
dst[i+15:i] := Convert_FP32_To_FP16FP(a[l+31:l])
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
...
__m64 _mm_cvtps_pi16 (__m128 a)
Synopsis
__m64 _mm_cvtps_pi16 (__m128 a)
#include "xmmintrin.h"
CPUID Flags: SSE
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed 16-bit integers, and store the results in dst.
Operation
FOR j := 0 to 3
i := 16*j
k := 32*j
dst[i+15:i] := Convert_FP32_To_Int16(a[k+31:k])
ENDFOR
cvtps2pi
__m64 _mm_cvtps_pi32 (__m128 a)
Synopsis
__m64 _mm_cvtps_pi32 (__m128 a)
#include "xmmintrin.h"
Instruction: cvtps2pi mm, xmm
CPUID Flags: SSE
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst.
Operation
FOR j := 0 to 1
i := 32*j
dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
ENDFOR
Performance
...
__m64 _mm_cvtps_pi8 (__m128 a)
Synopsis
__m64 _mm_cvtps_pi8 (__m128 a)
#include "xmmintrin.h"
CPUID Flags: SSE
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed 8-bit integers, and store the results in lower 4 elements of dst.
Operation
FOR j := 0 to 3
i := 8*j
k := 32*j
dst[i+7:i] := Convert_FP32_To_Int8(a[k+31:k])
ENDFOR
vcvtps2pd
__m512d _mm512_cvtpslo_pd (__m512 v2)
Synopsis
__m512d _mm512_cvtpslo_pd (__m512 v2)
#include "immintrin.h"
Instruction: vcvtps2pd zmm {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Performs element-by-element conversion of the lower half of packed single-precision (32-bit) floating-point elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst.
Operation
FOR j := 0 to 7
i := j*32
k := j*64
dst[k+63:k] := Float32ToFloat64(v2[i+31:i])
ENDFOR
dst[MAX:512] := 0
vcvtps2pd
__m512d _mm512_mask_cvtpslo_pd (__m512d src, __mmask8 k, __m512 v2)
Synopsis
__m512d _mm512_mask_cvtpslo_pd (__m512d src, __mmask8 k, __m512 v2)
#include "immintrin.h"
Instruction: vcvtps2pd zmm {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Performs element-by-element conversion of the lower half of packed single-precision (32-bit) floating-point elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
l := j*64
IF k[j]
dst[l+63:l] := Float32ToFloat64(v2[i+31:i])
ELSE
dst[l+63:l] := src[l+63:l]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128 _mm_cvtpu16_ps (__m64 a)
Synopsis
__m128 _mm_cvtpu16_ps (__m64 a)
#include "xmmintrin.h"
CPUID Flags: SSE
Description
Convert packed unsigned 16-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*16
m := j*32
dst[m+31:m] := Convert_UnsignedInt16_To_FP32(a[i+15:i])
ENDFOR
...
__m128 _mm_cvtpu8_ps (__m64 a)
Synopsis
__m128 _mm_cvtpu8_ps (__m64 a)
#include "xmmintrin.h"
CPUID Flags: SSE
Description
Convert the lower packed unsigned 8-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*8
m := j*32
dst[m+31:m] := Convert_UnsignedInt8_To_FP32(a[i+7:i])
ENDFOR
movsd
double _mm_cvtsd_f64 (__m128d a)
Synopsis
double _mm_cvtsd_f64 (__m128d a)
#include "emmintrin.h"
Instruction: movsd m64, xmm
CPUID Flags: SSE2
Description
Copy the lower double-precision (64-bit) floating-point element of a to dst.
Operation
dst[63:0] := a[63:0]
vcvtsd2si
int _mm_cvtsd_i32 (__m128d a)
Synopsis
int _mm_cvtsd_i32 (__m128d a)
#include "immintrin.h"
Instruction: vcvtsd2si r32, xmm
CPUID Flags: AVX512F
Description
Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer, and store the result in dst.
Operation
dst[31:0] := Convert_FP64_To_Int32(a[63:0])
vcvtsd2si
__int64 _mm_cvtsd_i64 (__m128d a)
Synopsis
__int64 _mm_cvtsd_i64 (__m128d a)
#include "immintrin.h"
Instruction: vcvtsd2si r64, xmm
CPUID Flags: AVX512F
Description
Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer, and store the result in dst.
Operation
dst[63:0] := Convert_FP64_To_Int64(a[63:0])
cvtsd2si
int _mm_cvtsd_si32 (__m128d a)
Synopsis
int _mm_cvtsd_si32 (__m128d a)
#include "emmintrin.h"
Instruction: cvtsd2si r32, xmm
CPUID Flags: SSE2
Description
Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer, and store the result in dst.
Operation
dst[31:0] := Convert_FP64_To_Int32(a[63:0])
Performance
cvtsd2si
__int64 _mm_cvtsd_si64 (__m128d a)
Synopsis
__int64 _mm_cvtsd_si64 (__m128d a)
#include "emmintrin.h"
Instruction: cvtsd2si r64, xmm
CPUID Flags: SSE2
Description
Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer, and store the result in dst.
Operation
dst[63:0] := Convert_FP64_To_Int64(a[63:0])
Performance
cvtsd2si
__int64 _mm_cvtsd_si64x (__m128d a)
Synopsis
__int64 _mm_cvtsd_si64x (__m128d a)
#include "emmintrin.h"
Instruction: cvtsd2si r64, xmm
CPUID Flags: SSE2
Description
Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer, and store the result in dst.
Operation
dst[63:0] := Convert_FP64_To_Int64(a[63:0])
Performance
cvtsd2ss
__m128 _mm_cvtsd_ss (__m128 a, __m128d b)
Synopsis
__m128 _mm_cvtsd_ss (__m128 a, __m128d b)
#include "emmintrin.h"
Instruction: cvtsd2ss xmm, xmm
CPUID Flags: SSE2
Description
Convert the lower double-precision (64-bit) floating-point element in b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
Operation
dst[31:0] := Convert_FP64_To_FP32(b[63:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0
Performance
vcvtsd2ss
__m128 _mm_mask_cvtsd_ss (__m128 src, __mmask8 k, __m128 a, __m128d b)
Synopsis
__m128 _mm_mask_cvtsd_ss (__m128 src, __mmask8 k, __m128 a, __m128d b)
#include "immintrin.h"
Instruction: vcvtsd2ss xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Convert the lower double-precision (64-bit) floating-point element in b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Operation
IF k[0]
dst[31:0] := Convert_FP64_To_FP32(b[63:0])
ELSE
dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vcvtsd2ss
__m128 _mm_maskz_cvtsd_ss (__mmask8 k, __m128 a, __m128d b)
Synopsis
__m128 _mm_maskz_cvtsd_ss (__mmask8 k, __m128 a, __m128d b)
#include "immintrin.h"
Instruction: vcvtsd2ss xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Convert the lower double-precision (64-bit) floating-point element in b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
IF k[0]
dst[31:0] := Convert_FP64_To_FP32(b[63:0])
ELSE
dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vcvtsd2usi
unsigned int _mm_cvtsd_u32 (__m128d a)
Synopsis
unsigned int _mm_cvtsd_u32 (__m128d a)
#include "immintrin.h"
Instruction: vcvtsd2usi r32, xmm
CPUID Flags: AVX512F
Description
Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst.
Operation
dst[31:0] := Convert_FP64_To_UnsignedInt32(a[63:0])
vcvtsd2usi
unsigned __int64 _mm_cvtsd_u64 (__m128d a)
Synopsis
unsigned __int64 _mm_cvtsd_u64 (__m128d a)
#include "immintrin.h"
Instruction: vcvtsd2usi r64, xmm
CPUID Flags: AVX512F
Description
Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 64-bit integer, and store the result in dst.
Operation
dst[63:0] := Convert_FP64_To_UnsignedInt64(a[63:0])
vpmovswb
__m128i _mm_cvtsepi16_epi8 (__m128i a)
Synopsis
__m128i _mm_cvtsepi16_epi8 (__m128i a)
#include "immintrin.h"
Instruction: vpmovswb
CPUID Flags: AVX512VL + AVX512BW
Description
Convert packed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst.
Operation
FOR j := 0 to 7
i := 16*j
l := 8*j
dst[l+7:l] := Saturate_Int16_To_Int8(a[i+15:i])
ENDFOR
dst[MAX:64] := 0
vpmovswb
__m128i _mm_mask_cvtsepi16_epi8 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_cvtsepi16_epi8 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovswb
CPUID Flags: AVX512VL + AVX512BW
Description
Convert packed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 16*j
l := 8*j
IF k[j]
dst[l+7:l] := Saturate_Int16_To_Int8(a[i+15:i])
ELSE
dst[l+7:l] := src[l+7:l]
FI
ENDFOR
dst[MAX:64] := 0
vpmovswb
__m128i _mm_maskz_cvtsepi16_epi8 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_cvtsepi16_epi8 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovswb
CPUID Flags: AVX512VL + AVX512BW
Description
Convert packed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 16*j
l := 8*j
IF k[j]
dst[l+7:l] := Saturate_Int16_To_Int8(a[i+15:i])
ELSE
dst[l+7:l] := 0
FI
ENDFOR
dst[MAX:64] := 0
vpmovswb
__m128i _mm256_cvtsepi16_epi8 (__m256i a)
Synopsis
__m128i _mm256_cvtsepi16_epi8 (__m256i a)
#include "immintrin.h"
Instruction: vpmovswb
CPUID Flags: AVX512VL + AVX512BW
Description
Convert packed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst.
Operation
FOR j := 0 to 15
i := 16*j
l := 8*j
dst[l+7:l] := Saturate_Int16_To_Int8(a[i+15:i])
ENDFOR
dst[MAX:128] := 0
vpmovswb
__m128i _mm256_mask_cvtsepi16_epi8 (__m128i src, __mmask16 k, __m256i a)
Synopsis
__m128i _mm256_mask_cvtsepi16_epi8 (__m128i src, __mmask16 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovswb
CPUID Flags: AVX512VL + AVX512BW
Description
Convert packed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := 16*j
l := 8*j
IF k[j]
dst[l+7:l] := Saturate_Int16_To_Int8(a[i+15:i])
ELSE
dst[l+7:l] := src[l+7:l]
FI
ENDFOR
dst[MAX:128] := 0
vpmovswb
__m128i _mm256_maskz_cvtsepi16_epi8 (__mmask16 k, __m256i a)
Synopsis
__m128i _mm256_maskz_cvtsepi16_epi8 (__mmask16 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovswb
CPUID Flags: AVX512VL + AVX512BW
Description
Convert packed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := 16*j
l := 8*j
IF k[j]
dst[l+7:l] := Saturate_Int16_To_Int8(a[i+15:i])
ELSE
dst[l+7:l] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpmovswb
__m256i _mm512_cvtsepi16_epi8 (__m512i a)
Synopsis
__m256i _mm512_cvtsepi16_epi8 (__m512i a)
#include "immintrin.h"
Instruction: vpmovswb
CPUID Flags: AVX512BW
Description
Convert packed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst.
Operation
FOR j := 0 to 31
i := 16*j
l := 8*j
dst[l+7:l] := Saturate_Int16_To_Int8(a[i+15:i])
ENDFOR
dst[MAX:256] := 0
vpmovswb
__m256i _mm512_mask_cvtsepi16_epi8 (__m256i src, __mmask32 k, __m512i a)
Synopsis
__m256i _mm512_mask_cvtsepi16_epi8 (__m256i src, __mmask32 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovswb
CPUID Flags: AVX512BW
Description
Convert packed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := 16*j
l := 8*j
IF k[j]
dst[l+7:l] := Saturate_Int16_To_Int8(a[i+15:i])
ELSE
dst[l+7:l] := src[l+7:l]
FI
ENDFOR
dst[MAX:256] := 0
vpmovswb
__m256i _mm512_maskz_cvtsepi16_epi8 (__mmask32 k, __m512i a)
Synopsis
__m256i _mm512_maskz_cvtsepi16_epi8 (__mmask32 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovswb
CPUID Flags: AVX512BW
Description
Convert packed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := 16*j
l := 8*j
IF k[j]
dst[l+7:l] := Saturate_Int16_To_Int8(a[i+15:i])
ELSE
dst[l+7:l] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpmovswb
void _mm_mask_cvtsepi16_storeu_epi8 (void* base_addr, __mmask8 k, __m128i a)
Synopsis
void _mm_mask_cvtsepi16_storeu_epi8 (void* base_addr, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovswb
CPUID Flags: AVX512VL + AVX512BW
Description
Convert packed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 7
i := 16*j
l := 8*j
IF k[j]
MEM[base_addr+l+7:base_addr+l] := Saturate_Int16_To_Int8(a[i+15:i])
FI
ENDFOR
vpmovswb
void _mm256_mask_cvtsepi16_storeu_epi8 (void* base_addr, __mmask16 k, __m256i a)
Synopsis
void _mm256_mask_cvtsepi16_storeu_epi8 (void* base_addr, __mmask16 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovswb
CPUID Flags: AVX512VL + AVX512BW
Description
Convert packed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 15
i := 16*j
l := 8*j
IF k[j]
MEM[base_addr+l+7:base_addr+l] := Saturate_Int16_To_Int8(a[i+15:i])
FI
ENDFOR
vpmovswb
void _mm512_mask_cvtsepi16_storeu_epi8 (void* base_addr, __mmask32 k, __m512i a)
Synopsis
void _mm512_mask_cvtsepi16_storeu_epi8 (void* base_addr, __mmask32 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovswb
CPUID Flags: AVX512BW
Description
Convert packed 16-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 31
i := 16*j
l := 8*j
IF k[j]
MEM[base_addr+l+7:base_addr+l] := Saturate_Int16_To_Int8(a[i+15:i])
FI
ENDFOR
vpmovsdw
__m128i _mm_cvtsepi32_epi16 (__m128i a)
Synopsis
__m128i _mm_cvtsepi32_epi16 (__m128i a)
#include "immintrin.h"
Instruction: vpmovsdw
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst.
Operation
FOR j := 0 to 3
i := 32*j
k := 16*j
dst[k+15:k] := Saturate_Int32_To_Int16(a[i+31:i])
ENDFOR
dst[MAX:64] := 0
vpmovsdw
__m128i _mm_mask_cvtsepi32_epi16 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_cvtsepi32_epi16 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsdw
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 32*j
l := 16*j
IF k[j]
dst[l+15:l] := Saturate_Int32_To_Int16(a[i+31:i])
ELSE
dst[l+15:l] := src[l+15:l]
FI
ENDFOR
dst[MAX:64] := 0
vpmovsdw
__m128i _mm_maskz_cvtsepi32_epi16 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_cvtsepi32_epi16 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsdw
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 32*j
l := 16*j
IF k[j]
dst[l+15:l] := Saturate_Int32_To_Int16(a[i+31:i])
ELSE
dst[l+15:l] := 0
FI
ENDFOR
dst[MAX:64] := 0
vpmovsdw
__m128i _mm256_cvtsepi32_epi16 (__m256i a)
Synopsis
__m128i _mm256_cvtsepi32_epi16 (__m256i a)
#include "immintrin.h"
Instruction: vpmovsdw
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst.
Operation
FOR j := 0 to 7
i := 32*j
k := 16*j
dst[k+15:k] := Saturate_Int32_To_Int16(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
vpmovsdw
__m128i _mm256_mask_cvtsepi32_epi16 (__m128i src, __mmask8 k, __m256i a)
Synopsis
__m128i _mm256_mask_cvtsepi32_epi16 (__m128i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovsdw
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 32*j
l := 16*j
IF k[j]
dst[l+15:l] := Saturate_Int32_To_Int16(a[i+31:i])
ELSE
dst[l+15:l] := src[l+15:l]
FI
ENDFOR
dst[MAX:128] := 0
vpmovsdw
__m128i _mm256_maskz_cvtsepi32_epi16 (__mmask8 k, __m256i a)
Synopsis
__m128i _mm256_maskz_cvtsepi32_epi16 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovsdw
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 32*j
l := 16*j
IF k[j]
dst[l+15:l] := Saturate_Int32_To_Int16(a[i+31:i])
ELSE
dst[l+15:l] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpmovsdw
__m256i _mm512_cvtsepi32_epi16 (__m512i a)
Synopsis
__m256i _mm512_cvtsepi32_epi16 (__m512i a)
#include "immintrin.h"
Instruction: vpmovsdw ymm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst.
Operation
FOR j := 0 to 15
i := 32*j
k := 16*j
dst[k+15:k] := Saturate_Int32_To_Int16(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
vpmovsdw
__m256i _mm512_mask_cvtsepi32_epi16 (__m256i src, __mmask16 k, __m512i a)
Synopsis
__m256i _mm512_mask_cvtsepi32_epi16 (__m256i src, __mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovsdw ymm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := 32*j
l := 16*j
IF k[j]
dst[l+15:l] := Saturate_Int32_To_Int16(a[i+31:i])
ELSE
dst[l+15:l] := src[l+15:l]
FI
ENDFOR
dst[MAX:256] := 0
vpmovsdw
__m256i _mm512_maskz_cvtsepi32_epi16 (__mmask16 k, __m512i a)
Synopsis
__m256i _mm512_maskz_cvtsepi32_epi16 (__mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovsdw ymm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := 32*j
l := 16*j
IF k[j]
dst[l+15:l] := Saturate_Int32_To_Int16(a[i+31:i])
ELSE
dst[l+15:l] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpmovsdb
__m128i _mm_cvtsepi32_epi8 (__m128i a)
Synopsis
__m128i _mm_cvtsepi32_epi8 (__m128i a)
#include "immintrin.h"
Instruction: vpmovsdb
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst.
Operation
FOR j := 0 to 3
i := 32*j
k := 8*j
dst[k+7:k] := Saturate_Int32_To_Int8(a[i+31:i])
ENDFOR
dst[MAX:32] := 0
vpmovsdb
__m128i _mm_mask_cvtsepi32_epi8 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_cvtsepi32_epi8 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsdb
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 32*j
l := 8*j
IF k[j]
dst[l+7:l] := Saturate_Int32_To_Int8(a[i+31:i])
ELSE
dst[l+7:l] := src[l+7:l]
FI
ENDFOR
dst[MAX:32] := 0
vpmovsdb
__m128i _mm_maskz_cvtsepi32_epi8 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_cvtsepi32_epi8 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsdb
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 32*j
l := 8*j
IF k[j]
dst[l+7:l] := Saturate_Int32_To_Int8(a[i+31:i])
ELSE
dst[l+7:l] := 0
FI
ENDFOR
dst[MAX:32] := 0
vpmovsdb
__m128i _mm256_cvtsepi32_epi8 (__m256i a)
Synopsis
__m128i _mm256_cvtsepi32_epi8 (__m256i a)
#include "immintrin.h"
Instruction: vpmovsdb
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst.
Operation
FOR j := 0 to 7
i := 32*j
k := 8*j
dst[k+7:k] := Saturate_Int32_To_Int8(a[i+31:i])
ENDFOR
dst[MAX:64] := 0
vpmovsdb
__m128i _mm256_mask_cvtsepi32_epi8 (__m128i src, __mmask8 k, __m256i a)
Synopsis
__m128i _mm256_mask_cvtsepi32_epi8 (__m128i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovsdb
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 32*j
l := 8*j
IF k[j]
dst[l+7:l] := Saturate_Int32_To_Int8(a[i+31:i])
ELSE
dst[l+7:l] := src[l+7:l]
FI
ENDFOR
dst[MAX:64] := 0
vpmovsdb
__m128i _mm256_maskz_cvtsepi32_epi8 (__mmask8 k, __m256i a)
Synopsis
__m128i _mm256_maskz_cvtsepi32_epi8 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovsdb
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 32*j
l := 8*j
IF k[j]
dst[l+7:l] := Saturate_Int32_To_Int8(a[i+31:i])
ELSE
dst[l+7:l] := 0
FI
ENDFOR
dst[MAX:64] := 0
vpmovsdb
__m128i _mm512_cvtsepi32_epi8 (__m512i a)
Synopsis
__m128i _mm512_cvtsepi32_epi8 (__m512i a)
#include "immintrin.h"
Instruction: vpmovsdb xmm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst.
Operation
FOR j := 0 to 15
i := 32*j
k := 8*j
dst[k+7:k] := Saturate_Int32_To_Int8(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
vpmovsdb
__m128i _mm512_mask_cvtsepi32_epi8 (__m128i src, __mmask16 k, __m512i a)
Synopsis
__m128i _mm512_mask_cvtsepi32_epi8 (__m128i src, __mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovsdb xmm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := 32*j
l := 8*j
IF k[j]
dst[l+7:l] := Saturate_Int32_To_Int8(a[i+31:i])
ELSE
dst[l+7:l] := src[l+7:l]
FI
ENDFOR
dst[MAX:128] := 0
vpmovsdb
__m128i _mm512_maskz_cvtsepi32_epi8 (__mmask16 k, __m512i a)
Synopsis
__m128i _mm512_maskz_cvtsepi32_epi8 (__mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovsdb xmm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := 32*j
l := 8*j
IF k[j]
dst[l+7:l] := Saturate_Int32_To_Int8(a[i+31:i])
ELSE
dst[l+7:l] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpmovsdw
void _mm_mask_cvtsepi32_storeu_epi16 (void* base_addr, __mmask8 k, __m128i a)
Synopsis
void _mm_mask_cvtsepi32_storeu_epi16 (void* base_addr, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsdw
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 3
i := 32*j
l := 16*j
IF k[j]
MEM[base_addr+l+15:base_addr+l] := Saturate_Int32_To_Int16(a[i+31:i])
FI
ENDFOR
vpmovsdw
void _mm256_mask_cvtsepi32_storeu_epi16 (void* base_addr, __mmask8 k, __m256i a)
Synopsis
void _mm256_mask_cvtsepi32_storeu_epi16 (void* base_addr, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovsdw
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 7
i := 32*j
l := 16*j
IF k[j]
MEM[base_addr+l+15:base_addr+l] := Saturate_Int32_To_Int16(a[i+31:i])
FI
ENDFOR
vpmovsdw
void _mm512_mask_cvtsepi32_storeu_epi16 (void* base_addr, __mmask16 k, __m512i a)
Synopsis
void _mm512_mask_cvtsepi32_storeu_epi16 (void* base_addr, __mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovsdw m256 {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 15
i := 32*j
l := 16*j
IF k[j]
MEM[base_addr+l+15:base_addr+l] := Saturate_Int32_To_Int16(a[i+31:i])
FI
ENDFOR
vpmovsdb
void _mm_mask_cvtsepi32_storeu_epi8 (void* base_addr, __mmask8 k, __m128i a)
Synopsis
void _mm_mask_cvtsepi32_storeu_epi8 (void* base_addr, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsdb
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 3
i := 32*j
l := 8*j
IF k[j]
MEM[base_addr+l+7:base_addr+l] := Saturate_Int32_To_Int8(a[i+31:i])
FI
ENDFOR
vpmovsdb
void _mm256_mask_cvtsepi32_storeu_epi8 (void* base_addr, __mmask8 k, __m256i a)
Synopsis
void _mm256_mask_cvtsepi32_storeu_epi8 (void* base_addr, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovsdb
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 7
i := 32*j
l := 8*j
IF k[j]
MEM[base_addr+l+7:base_addr+l] := Saturate_Int32_To_Int8(a[i+31:i])
FI
ENDFOR
vpmovsdb
void _mm512_mask_cvtsepi32_storeu_epi8 (void* base_addr, __mmask16 k, __m512i a)
Synopsis
void _mm512_mask_cvtsepi32_storeu_epi8 (void* base_addr, __mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovsdb m128 {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed 32-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 15
i := 32*j
l := 8*j
IF k[j]
MEM[base_addr+l+7:base_addr+l] := Saturate_Int32_To_Int8(a[i+31:i])
FI
ENDFOR
vpmovsqw
__m128i _mm_cvtsepi64_epi16 (__m128i a)
Synopsis
__m128i _mm_cvtsepi64_epi16 (__m128i a)
#include "immintrin.h"
Instruction: vpmovsqw
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst.
Operation
FOR j := 0 to 1
i := 64*j
k := 16*j
dst[k+15:k] := Saturate_Int64_To_Int16(a[i+63:i])
ENDFOR
dst[MAX:32] := 0
vpmovsqw
__m128i _mm_mask_cvtsepi64_epi16 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_cvtsepi64_epi16 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsqw
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := 64*j
l := 16*j
IF k[j]
dst[l+15:l] := Saturate_Int64_To_Int16(a[i+63:i])
ELSE
dst[l+15:l] := src[l+15:l]
FI
ENDFOR
dst[MAX:32] := 0
vpmovsqw
__m128i _mm_maskz_cvtsepi64_epi16 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_cvtsepi64_epi16 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsqw
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := 64*j
l := 16*j
IF k[j]
dst[l+15:l] := Saturate_Int64_To_Int16(a[i+63:i])
ELSE
dst[l+15:l] := 0
FI
ENDFOR
dst[MAX:32] := 0
vpmovsqw
__m128i _mm256_cvtsepi64_epi16 (__m256i a)
Synopsis
__m128i _mm256_cvtsepi64_epi16 (__m256i a)
#include "immintrin.h"
Instruction: vpmovsqw
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst.
Operation
FOR j := 0 to 3
i := 64*j
k := 16*j
dst[k+15:k] := Saturate_Int64_To_Int16(a[i+63:i])
ENDFOR
dst[MAX:64] := 0
vpmovsqw
__m128i _mm256_mask_cvtsepi64_epi16 (__m128i src, __mmask8 k, __m256i a)
Synopsis
__m128i _mm256_mask_cvtsepi64_epi16 (__m128i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovsqw
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 64*j
l := 16*j
IF k[j]
dst[l+15:l] := Saturate_Int64_To_Int16(a[i+63:i])
ELSE
dst[l+15:l] := src[l+15:l]
FI
ENDFOR
dst[MAX:64] := 0
vpmovsqw
__m128i _mm256_maskz_cvtsepi64_epi16 (__mmask8 k, __m256i a)
Synopsis
__m128i _mm256_maskz_cvtsepi64_epi16 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovsqw
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 64*j
l := 16*j
IF k[j]
dst[l+15:l] := Saturate_Int64_To_Int16(a[i+63:i])
ELSE
dst[l+15:l] := 0
FI
ENDFOR
dst[MAX:64] := 0
vpmovsqw
__m128i _mm512_cvtsepi64_epi16 (__m512i a)
Synopsis
__m128i _mm512_cvtsepi64_epi16 (__m512i a)
#include "immintrin.h"
Instruction: vpmovsqw xmm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst.
Operation
FOR j := 0 to 7
i := 64*j
k := 16*j
dst[k+15:k] := Saturate_Int64_To_Int16(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
vpmovsqw
__m128i _mm512_mask_cvtsepi64_epi16 (__m128i src, __mmask8 k, __m512i a)
Synopsis
__m128i _mm512_mask_cvtsepi64_epi16 (__m128i src, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovsqw xmm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 64*j
l := 16*j
IF k[j]
dst[l+15:l] := Saturate_Int64_To_Int16(a[i+63:i])
ELSE
dst[l+15:l] := src[l+15:l]
FI
ENDFOR
dst[MAX:128] := 0
vpmovsqw
__m128i _mm512_maskz_cvtsepi64_epi16 (__mmask8 k, __m512i a)
Synopsis
__m128i _mm512_maskz_cvtsepi64_epi16 (__mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovsqw xmm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 64*j
l := 16*j
IF k[j]
dst[l+15:l] := Saturate_Int64_To_Int16(a[i+63:i])
ELSE
dst[l+15:l] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpmovsqd
__m128i _mm_cvtsepi64_epi32 (__m128i a)
Synopsis
__m128i _mm_cvtsepi64_epi32 (__m128i a)
#include "immintrin.h"
Instruction: vpmovsqd
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst.
Operation
FOR j := 0 to 1
i := 64*j
k := 32*j
dst[k+31:k] := Saturate_Int64_To_Int32(a[i+63:i])
ENDFOR
dst[MAX:64] := 0
vpmovsqd
__m128i _mm_mask_cvtsepi64_epi32 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_cvtsepi64_epi32 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsqd
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := 64*j
l := 32*j
IF k[j]
dst[l+31:l] := Saturate_Int64_To_Int32(a[i+63:i])
ELSE
dst[l+31:l] := src[l+31:l]
FI
ENDFOR
dst[MAX:64] := 0
vpmovsqd
__m128i _mm_maskz_cvtsepi64_epi32 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_cvtsepi64_epi32 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsqd
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := 64*j
l := 32*j
IF k[j]
dst[l+31:l] := Saturate_Int64_To_Int32(a[i+63:i])
ELSE
dst[l+31:l] := 0
FI
ENDFOR
dst[MAX:64] := 0
vpmovsqd
__m128i _mm256_cvtsepi64_epi32 (__m256i a)
Synopsis
__m128i _mm256_cvtsepi64_epi32 (__m256i a)
#include "immintrin.h"
Instruction: vpmovsqd
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst.
Operation
FOR j := 0 to 3
i := 64*j
k := 32*j
dst[k+31:k] := Saturate_Int64_To_Int32(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
vpmovsqd
__m128i _mm256_mask_cvtsepi64_epi32 (__m128i src, __mmask8 k, __m256i a)
Synopsis
__m128i _mm256_mask_cvtsepi64_epi32 (__m128i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovsqd
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 64*j
l := 32*j
IF k[j]
dst[l+31:l] := Saturate_Int64_To_Int32(a[i+63:i])
ELSE
dst[l+31:l] := src[l+31:l]
FI
ENDFOR
dst[MAX:128] := 0
vpmovsqd
__m128i _mm256_maskz_cvtsepi64_epi32 (__mmask8 k, __m256i a)
Synopsis
__m128i _mm256_maskz_cvtsepi64_epi32 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovsqd
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 64*j
l := 32*j
IF k[j]
dst[l+31:l] := Saturate_Int64_To_Int32(a[i+63:i])
ELSE
dst[l+31:l] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpmovsqd
__m256i _mm512_cvtsepi64_epi32 (__m512i a)
Synopsis
__m256i _mm512_cvtsepi64_epi32 (__m512i a)
#include "immintrin.h"
Instruction: vpmovsqd ymm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst.
Operation
FOR j := 0 to 7
i := 64*j
k := 32*j
dst[k+31:k] := Saturate_Int64_To_Int32(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
vpmovsqd
__m256i _mm512_mask_cvtsepi64_epi32 (__m256i src, __mmask8 k, __m512i a)
Synopsis
__m256i _mm512_mask_cvtsepi64_epi32 (__m256i src, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovsqd ymm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 64*j
l := 32*j
IF k[j]
dst[l+31:l] := Saturate_Int64_To_Int32(a[i+63:i])
ELSE
dst[l+31:l] := src[l+31:l]
FI
ENDFOR
dst[MAX:256] := 0
vpmovsqd
__m256i _mm512_maskz_cvtsepi64_epi32 (__mmask8 k, __m512i a)
Synopsis
__m256i _mm512_maskz_cvtsepi64_epi32 (__mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovsqd ymm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 64*j
l := 32*j
IF k[j]
dst[l+31:l] := Saturate_Int64_To_Int32(a[i+63:i])
ELSE
dst[l+31:l] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpmovsqb
__m128i _mm_cvtsepi64_epi8 (__m128i a)
Synopsis
__m128i _mm_cvtsepi64_epi8 (__m128i a)
#include "immintrin.h"
Instruction: vpmovsqb
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst.
Operation
FOR j := 0 to 1
i := 64*j
k := 8*j
dst[k+7:k] := Saturate_Int64_To_Int8(a[i+63:i])
ENDFOR
dst[MAX:16] := 0
vpmovsqb
__m128i _mm_mask_cvtsepi64_epi8 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_cvtsepi64_epi8 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsqb
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := 64*j
l := 8*j
IF k[j]
dst[l+7:l] := Saturate_Int64_To_Int8(a[i+63:i])
ELSE
dst[l+7:l] := src[l+7:l]
FI
ENDFOR
dst[MAX:16] := 0
vpmovsqb
__m128i _mm_maskz_cvtsepi64_epi8 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_cvtsepi64_epi8 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsqb
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := 64*j
l := 8*j
IF k[j]
dst[l+7:l] := Saturate_Int64_To_Int8(a[i+63:i])
ELSE
dst[l+7:l] := 0
FI
ENDFOR
dst[MAX:16] := 0
vpmovsqb
__m128i _mm256_cvtsepi64_epi8 (__m256i a)
Synopsis
__m128i _mm256_cvtsepi64_epi8 (__m256i a)
#include "immintrin.h"
Instruction: vpmovsqb
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst.
Operation
FOR j := 0 to 3
i := 64*j
k := 8*j
dst[k+7:k] := Saturate_Int64_To_Int8(a[i+63:i])
ENDFOR
dst[MAX:32] := 0
vpmovsqb
__m128i _mm256_mask_cvtsepi64_epi8 (__m128i src, __mmask8 k, __m256i a)
Synopsis
__m128i _mm256_mask_cvtsepi64_epi8 (__m128i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovsqb
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 64*j
l := 8*j
IF k[j]
dst[l+7:l] := Saturate_Int64_To_Int8(a[i+63:i])
ELSE
dst[l+7:l] := src[l+7:l]
FI
ENDFOR
dst[MAX:32] := 0
vpmovsqb
__m128i _mm256_maskz_cvtsepi64_epi8 (__mmask8 k, __m256i a)
Synopsis
__m128i _mm256_maskz_cvtsepi64_epi8 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovsqb
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 64*j
l := 8*j
IF k[j]
dst[l+7:l] := Saturate_Int64_To_Int8(a[i+63:i])
ELSE
dst[l+7:l] := 0
FI
ENDFOR
dst[MAX:32] := 0
vpmovsqb
__m128i _mm512_cvtsepi64_epi8 (__m512i a)
Synopsis
__m128i _mm512_cvtsepi64_epi8 (__m512i a)
#include "immintrin.h"
Instruction: vpmovsqb xmm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst.
Operation
FOR j := 0 to 7
i := 64*j
k := 8*j
dst[k+7:k] := Saturate_Int64_To_Int8(a[i+63:i])
ENDFOR
dst[MAX:64] := 0
vpmovsqb
__m128i _mm512_mask_cvtsepi64_epi8 (__m128i src, __mmask8 k, __m512i a)
Synopsis
__m128i _mm512_mask_cvtsepi64_epi8 (__m128i src, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovsqb xmm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 64*j
l := 8*j
IF k[j]
dst[l+7:l] := Saturate_Int64_To_Int8(a[i+63:i])
ELSE
dst[l+7:l] := src[l+7:l]
FI
ENDFOR
dst[MAX:64] := 0
vpmovsqb
__m128i _mm512_maskz_cvtsepi64_epi8 (__mmask8 k, __m512i a)
Synopsis
__m128i _mm512_maskz_cvtsepi64_epi8 (__mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovsqb xmm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 64*j
l := 8*j
IF k[j]
dst[l+7:l] := Saturate_Int64_To_Int8(a[i+63:i])
ELSE
dst[l+7:l] := 0
FI
ENDFOR
dst[MAX:64] := 0
vpmovsqw
void _mm_mask_cvtsepi64_storeu_epi16 (void* base_addr, __mmask8 k, __m128i a)
Synopsis
void _mm_mask_cvtsepi64_storeu_epi16 (void* base_addr, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsqw
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 1
i := 64*j
l := 16*j
IF k[j]
MEM[base_addr+l+15:base_addr+l] := Saturate_Int64_To_Int16(a[i+63:i])
FI
ENDFOR
vpmovsqw
void _mm256_mask_cvtsepi64_storeu_epi16 (void* base_addr, __mmask8 k, __m256i a)
Synopsis
void _mm256_mask_cvtsepi64_storeu_epi16 (void* base_addr, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovsqw
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 3
i := 64*j
l := 16*j
IF k[j]
MEM[base_addr+l+15:base_addr+l] := Saturate_Int64_To_Int16(a[i+63:i])
FI
ENDFOR
vpmovsqw
void _mm512_mask_cvtsepi64_storeu_epi16 (void* base_addr, __mmask8 k, __m512i a)
Synopsis
void _mm512_mask_cvtsepi64_storeu_epi16 (void* base_addr, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovsqw m128 {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed 64-bit integers in a to packed 16-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 7
i := 64*j
l := 16*j
IF k[j]
MEM[base_addr+l+15:base_addr+l] := Saturate_Int64_To_Int16(a[i+63:i])
FI
ENDFOR
vpmovsqd
void _mm_mask_cvtsepi64_storeu_epi32 (void* base_addr, __mmask8 k, __m128i a)
Synopsis
void _mm_mask_cvtsepi64_storeu_epi32 (void* base_addr, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsqd
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 1
i := 64*j
l := 32*j
IF k[j]
MEM[base_addr+l+31:base_addr+l] := Saturate_Int64_To_Int32(a[i+63:i])
FI
ENDFOR
vpmovsqd
void _mm256_mask_cvtsepi64_storeu_epi32 (void* base_addr, __mmask8 k, __m256i a)
Synopsis
void _mm256_mask_cvtsepi64_storeu_epi32 (void* base_addr, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovsqd
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 3
i := 64*j
l := 32*j
IF k[j]
MEM[base_addr+l+31:base_addr+l] := Saturate_Int64_To_Int32(a[i+63:i])
FI
ENDFOR
vpmovsqd
void _mm512_mask_cvtsepi64_storeu_epi32 (void* base_addr, __mmask8 k, __m512i a)
Synopsis
void _mm512_mask_cvtsepi64_storeu_epi32 (void* base_addr, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovsqd m256 {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed 64-bit integers in a to packed 32-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 7
i := 64*j
l := 32*j
IF k[j]
MEM[base_addr+l+31:base_addr+l] := Saturate_Int64_To_Int32(a[i+63:i])
FI
ENDFOR
vpmovsqb
void _mm_mask_cvtsepi64_storeu_epi8 (void* base_addr, __mmask8 k, __m128i a)
Synopsis
void _mm_mask_cvtsepi64_storeu_epi8 (void* base_addr, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovsqb
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 1
i := 64*j
l := 8*j
IF k[j]
MEM[base_addr+l+7:base_addr+l] := Saturate_Int64_To_Int8(a[i+63:i])
FI
ENDFOR
vpmovsqb
void _mm256_mask_cvtsepi64_storeu_epi8 (void* base_addr, __mmask8 k, __m256i a)
Synopsis
void _mm256_mask_cvtsepi64_storeu_epi8 (void* base_addr, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovsqb
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 3
i := 64*j
l := 8*j
IF k[j]
MEM[base_addr+l+7:base_addr+l] := Saturate_Int64_To_Int8(a[i+63:i])
FI
ENDFOR
vpmovsqb
void _mm512_mask_cvtsepi64_storeu_epi8 (void* base_addr, __mmask8 k, __m512i a)
Synopsis
void _mm512_mask_cvtsepi64_storeu_epi8 (void* base_addr, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovsqb m64 {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed 64-bit integers in a to packed 8-bit integers with signed saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 7
i := 64*j
l := 8*j
IF k[j]
MEM[base_addr+l+7:base_addr+l] := Saturate_Int64_To_Int8(a[i+63:i])
FI
ENDFOR
...
float _cvtsh_ss (unsigned short a)
Synopsis
float _cvtsh_ss (unsigned short a)
#include "immintrin.h"
Description
Convert the half-precision (16-bit) floating-point value a to a single-precision (32-bit) floating-point value, and store the result in dst.
Operation
dst[31:0] := Convert_FP16_To_FP32(a[15:0])
movd
int _mm_cvtsi128_si32 (__m128i a)
Synopsis
int _mm_cvtsi128_si32 (__m128i a)
#include "emmintrin.h"
Instruction: movd r32, xmm
CPUID Flags: SSE2
Description
Copy the lower 32-bit integer in a to dst.
Operation
dst[31:0] := a[31:0]
movq
__int64 _mm_cvtsi128_si64 (__m128i a)
Synopsis
__int64 _mm_cvtsi128_si64 (__m128i a)
#include "emmintrin.h"
Instruction: movq r64, xmm
CPUID Flags: SSE2
Description
Copy the lower 64-bit integer in a to dst.
Operation
dst[63:0] := a[63:0]
Performance
movq
__int64 _mm_cvtsi128_si64x (__m128i a)
Synopsis
__int64 _mm_cvtsi128_si64x (__m128i a)
#include "emmintrin.h"
Instruction: movq r64, xmm
CPUID Flags: SSE2
Description
Copy the lower 64-bit integer in a to dst.
Operation
dst[63:0] := a[63:0]
Performance
cvtsi2sd
__m128d _mm_cvtsi32_sd (__m128d a, int b)
Synopsis
__m128d _mm_cvtsi32_sd (__m128d a, int b)
#include "emmintrin.h"
Instruction: cvtsi2sd xmm, r32
CPUID Flags: SSE2
Description
Convert the 32-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
Operation
dst[63:0] := Convert_Int32_To_FP64(b[31:0])
dst[127:64] := a[127:64]
dst[MAX:128] := 0
Performance
movd
__m128i _mm_cvtsi32_si128 (int a)
Synopsis
__m128i _mm_cvtsi32_si128 (int a)
#include "emmintrin.h"
Instruction: movd xmm, r32
CPUID Flags: SSE2
Description
Copy 32-bit integer a to the lower elements of dst, and zero the upper elements of dst.
Operation
dst[31:0] := a[31:0]
dst[127:32] := 0
cvtsi2ss
__m128 _mm_cvtsi32_ss (__m128 a, int b)
Synopsis
__m128 _mm_cvtsi32_ss (__m128 a, int b)
#include "xmmintrin.h"
Instruction: cvtsi2ss xmm, r32
CPUID Flags: SSE
Description
Convert the 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
dst[31:0] := Convert_Int32_To_FP32(b[31:0])
dst[127:32] := a[127:32]
Performance
cvtsi2sd
__m128d _mm_cvtsi64_sd (__m128d a, __int64 b)
Synopsis
__m128d _mm_cvtsi64_sd (__m128d a, __int64 b)
#include "emmintrin.h"
Instruction: cvtsi2sd xmm, r64
CPUID Flags: SSE2
Description
Convert the 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
Operation
dst[63:0] := Convert_Int64_To_FP64(b[63:0])
dst[127:64] := a[127:64]
dst[MAX:128] := 0
Performance
movq
__m128i _mm_cvtsi64_si128 (__int64 a)
Synopsis
__m128i _mm_cvtsi64_si128 (__int64 a)
#include "emmintrin.h"
Instruction: movq xmm, r64
CPUID Flags: SSE2
Description
Copy 64-bit integer a to the lower element of dst, and zero the upper element.
Operation
dst[63:0] := a[63:0]
dst[127:64] := 0
Performance
cvtsi2ss
__m128 _mm_cvtsi64_ss (__m128 a, __int64 b)
Synopsis
__m128 _mm_cvtsi64_ss (__m128 a, __int64 b)
#include "xmmintrin.h"
Instruction: cvtsi2ss xmm, r64
CPUID Flags: SSE
Description
Convert the 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
dst[31:0] := Convert_Int64_To_FP32(b[63:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0
Performance
cvtsi2sd
__m128d _mm_cvtsi64x_sd (__m128d a, __int64 b)
Synopsis
__m128d _mm_cvtsi64x_sd (__m128d a, __int64 b)
#include "emmintrin.h"
Instruction: cvtsi2sd xmm, r64
CPUID Flags: SSE2
Description
Convert the 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
Operation
dst[63:0] := Convert_Int64_To_FP64(b[63:0])
dst[127:64] := a[127:64]
dst[MAX:128] := 0
Performance
movq
__m128i _mm_cvtsi64x_si128 (__int64 a)
Synopsis
__m128i _mm_cvtsi64x_si128 (__int64 a)
#include "emmintrin.h"
Instruction: movq xmm, r64
CPUID Flags: SSE2
Description
Copy 64-bit integer a to the lower element of dst, and zero the upper element.
Operation
dst[63:0] := a[63:0]
dst[127:64] := 0
Performance
movss
float _mm_cvtss_f32 (__m128 a)
Synopsis
float _mm_cvtss_f32 (__m128 a)
#include "xmmintrin.h"
Instruction: movss m32, xmm
CPUID Flags: SSE
Description
Copy the lower single-precision (32-bit) floating-point element of a to dst.
Operation
dst[31:0] := a[31:0]
vcvtss2si
int _mm_cvtss_i32 (__m128 a)
Synopsis
int _mm_cvtss_i32 (__m128 a)
#include "immintrin.h"
Instruction: vcvtss2si r32, xmm
CPUID Flags: AVX512F
Description
Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer, and store the result in dst.
Operation
dst[31:0] := Convert_FP32_To_Int32(a[31:0])
vcvtss2si
__int64 _mm_cvtss_i64 (__m128 a)
Synopsis
__int64 _mm_cvtss_i64 (__m128 a)
#include "immintrin.h"
Instruction: vcvtss2si r64, xmm
CPUID Flags: AVX512F
Description
Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer, and store the result in dst.
Operation
dst[63:0] := Convert_FP32_To_Int64(a[31:0])
cvtss2sd
__m128d _mm_cvtss_sd (__m128d a, __m128 b)
Synopsis
__m128d _mm_cvtss_sd (__m128d a, __m128 b)
#include "emmintrin.h"
Instruction: cvtss2sd xmm, xmm
CPUID Flags: SSE2
Description
Convert the lower single-precision (32-bit) floating-point element in b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
Operation
dst[63:0] := Convert_FP32_To_FP64(b[31:0])
dst[127:64] := a[127:64]
dst[MAX:128] := 0
Performance
vcvtss2sd
__m128d _mm_mask_cvtss_sd (__m128d src, __mmask8 k, __m128d a, __m128 b)
Synopsis
__m128d _mm_mask_cvtss_sd (__m128d src, __mmask8 k, __m128d a, __m128 b)
#include "immintrin.h"
Instruction: vcvtss2sd xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Convert the lower single-precision (32-bit) floating-point element in b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Operation
IF k[0]
dst[63:0] := Convert_FP32_To_FP64(b[31:0])
ELSE
dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vcvtss2sd
__m128d _mm_maskz_cvtss_sd (__mmask8 k, __m128d a, __m128 b)
Synopsis
__m128d _mm_maskz_cvtss_sd (__mmask8 k, __m128d a, __m128 b)
#include "immintrin.h"
Instruction: vcvtss2sd xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Convert the lower single-precision (32-bit) floating-point element in b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Operation
IF k[0]
dst[63:0] := Convert_FP32_To_FP64(b[31:0])
ELSE
dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
...
unsigned short _cvtss_sh (float a, int imm8)
Synopsis
unsigned short _cvtss_sh (float a, int imm8)
#include "immintrin.h"
Description
Convert the single-precision (32-bit) floating-point value a to a half-precision (16-bit) floating-point value, and store the result in dst.
Operation
dst[15:0] := Convert_FP32_To_FP16(a[31:0])
cvtss2si
int _mm_cvtss_si32 (__m128 a)
Synopsis
int _mm_cvtss_si32 (__m128 a)
#include "xmmintrin.h"
Instruction: cvtss2si r32, xmm
CPUID Flags: SSE
Description
Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer, and store the result in dst.
Operation
dst[31:0] := Convert_FP32_To_Int32(a[31:0])
cvtss2si
__int64 _mm_cvtss_si64 (__m128 a)
Synopsis
__int64 _mm_cvtss_si64 (__m128 a)
#include "xmmintrin.h"
Instruction: cvtss2si r64, xmm
CPUID Flags: SSE
Description
Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer, and store the result in dst.
Operation
dst[63:0] := Convert_FP32_To_Int64(a[31:0])
Performance
vcvtss2usi
unsigned int _mm_cvtss_u32 (__m128 a)
Synopsis
unsigned int _mm_cvtss_u32 (__m128 a)
#include "immintrin.h"
Instruction: vcvtss2usi r32, xmm
CPUID Flags: AVX512F
Description
Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer, and store the result in dst.
Operation
dst[31:0] := Convert_FP32_To_UnsignedInt32(a[31:0])
vcvtss2usi
unsigned __int64 _mm_cvtss_u64 (__m128 a)
Synopsis
unsigned __int64 _mm_cvtss_u64 (__m128 a)
#include "immintrin.h"
Instruction: vcvtss2usi r64, xmm
CPUID Flags: AVX512F
Description
Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 64-bit integer, and store the result in dst.
Operation
dst[63:0] := Convert_FP32_To_UnsignedInt64(a[31:0])
cvttps2pi
__m64 _mm_cvtt_ps2pi (__m128 a)
Synopsis
__m64 _mm_cvtt_ps2pi (__m128 a)
#include "xmmintrin.h"
Instruction: cvttps2pi mm, xmm
CPUID Flags: SSE
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst.
Operation
FOR j := 0 to 1
i := 32*j
dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
ENDFOR
Performance
vcvttpd2dq
__m256i _mm512_cvtt_roundpd_epi32 (__m512d a, int sae)
Synopsis
__m256i _mm512_cvtt_roundpd_epi32 (__m512d a, int sae)
#include "immintrin.h"
Instruction: vcvttpd2dq ymm {k}, zmm {sae}
CPUID Flags: AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst.
Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
FOR j := 0 to 7
i := 32*j
k := 64*j
dst[i+31:i] := Convert_FP64_To_IntegerTruncate(a[k+63:k])
ENDFOR
dst[MAX:256] := 0
vcvttpd2dq
__m256i _mm512_mask_cvtt_roundpd_epi32 (__m256i src, __mmask8 k, __m512d a, int sae)
Synopsis
__m256i _mm512_mask_cvtt_roundpd_epi32 (__m256i src, __mmask8 k, __m512d a, int sae)
#include "immintrin.h"
Instruction: vcvttpd2dq ymm {k}, zmm {sae}
CPUID Flags: AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
FOR j := 0 to 7
i := 32*j
l := 64*j
IF k[j]
dst[i+31:i] := Convert_FP64_To_IntegerTruncate(a[l+63:l])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vcvttpd2dq
__m256i _mm512_maskz_cvtt_roundpd_epi32 (__mmask8 k, __m512d a, int sae)
Synopsis
__m256i _mm512_maskz_cvtt_roundpd_epi32 (__mmask8 k, __m512d a, int sae)
#include "immintrin.h"
Instruction: vcvttpd2dq ymm {k}, zmm {sae}
CPUID Flags: AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
FOR j := 0 to 7
i := 32*j
l := 64*j
IF k[j]
dst[i+31:i] := Convert_FP64_To_IntegerTruncate(a[l+63:l])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vcvttpd2qq
__m512i _mm512_cvtt_roundpd_epi64 (__m512d a, int sae)
Synopsis
__m512i _mm512_cvtt_roundpd_epi64 (__m512d a, int sae)
#include "immintrin.h"
Instruction: vcvttpd2qq
CPUID Flags: AVX512DQ
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed 64-bit integers with truncation, and store the results in dst. Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
vcvttpd2qq
__m512i _mm512_mask_cvtt_roundpd_epi64 (__m512i src, __mmask8 k, __m512d a, int sae)
Synopsis
__m512i _mm512_mask_cvtt_roundpd_epi64 (__m512i src, __mmask8 k, __m512d a, int sae)
#include "immintrin.h"
Instruction: vcvttpd2qq
CPUID Flags: AVX512DQ
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed 64-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vcvttpd2qq
__m512i _mm512_maskz_cvtt_roundpd_epi64 (__mmask8 k, __m512d a, int sae)
Synopsis
__m512i _mm512_maskz_cvtt_roundpd_epi64 (__mmask8 k, __m512d a, int sae)
#include "immintrin.h"
Instruction: vcvttpd2qq
CPUID Flags: AVX512DQ
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed 64-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vcvttpd2udq
__m256i _mm512_cvtt_roundpd_epu32 (__m512d a, int sae)
Synopsis
__m256i _mm512_cvtt_roundpd_epu32 (__m512d a, int sae)
#include "immintrin.h"
Instruction: vcvttpd2udq ymm {k}, zmm {sae}
CPUID Flags: AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst.
Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
FOR j := 0 to 7
i := 32*j
k := 64*j
dst[i+31:i] := Convert_FP64_To_UnsignedIntegerTruncate(a[k+63:k])
ENDFOR
dst[MAX:256] := 0
vcvttpd2udq
__m256i _mm512_mask_cvtt_roundpd_epu32 (__m256i src, __mmask8 k, __m512d a, int sae)
Synopsis
__m256i _mm512_mask_cvtt_roundpd_epu32 (__m256i src, __mmask8 k, __m512d a, int sae)
#include "immintrin.h"
Instruction: vcvttpd2udq ymm {k}, zmm {sae}
CPUID Flags: AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
FOR j := 0 to 7
i := 32*j
l := 64*j
IF k[j]
dst[i+31:i] := Convert_FP64_To_UnsignedIntegerTruncate(a[l+63:l])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vcvttpd2udq
__m256i _mm512_maskz_cvtt_roundpd_epu32 (__mmask8 k, __m512d a, int sae)
Synopsis
__m256i _mm512_maskz_cvtt_roundpd_epu32 (__mmask8 k, __m512d a, int sae)
#include "immintrin.h"
Instruction: vcvttpd2udq ymm {k}, zmm {sae}
CPUID Flags: AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
FOR j := 0 to 7
i := 32*j
l := 64*j
IF k[j]
dst[i+31:i] := Convert_FP64_To_UnsignedIntegerTruncate(a[l+63:l])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vcvttpd2uqq
__m512i _mm512_cvtt_roundpd_epu64 (__m512d a, int sae)
Synopsis
__m512i _mm512_cvtt_roundpd_epu64 (__m512d a, int sae)
#include "immintrin.h"
Instruction: vcvttpd2uqq
CPUID Flags: AVX512DQ
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers with truncation, and store the results in dst. Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
vcvttpd2uqq
__m512i _mm512_mask_cvtt_roundpd_epu64 (__m512i src, __mmask8 k, __m512d a, int sae)
Synopsis
__m512i _mm512_mask_cvtt_roundpd_epu64 (__m512i src, __mmask8 k, __m512d a, int sae)
#include "immintrin.h"
Instruction: vcvttpd2uqq
CPUID Flags: AVX512DQ
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vcvttpd2uqq
__m512i _mm512_maskz_cvtt_roundpd_epu64 (__mmask8 k, __m512d a, int sae)
Synopsis
__m512i _mm512_maskz_cvtt_roundpd_epu64 (__mmask8 k, __m512d a, int sae)
#include "immintrin.h"
Instruction: vcvttpd2uqq
CPUID Flags: AVX512DQ
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vcvttps2dq
__m512i _mm512_cvtt_roundps_epi32 (__m512 a, int sae)
Synopsis
__m512i _mm512_cvtt_roundps_epi32 (__m512 a, int sae)
#include "immintrin.h"
Instruction: vcvttps2dq zmm {k}, zmm {sae}
CPUID Flags: AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst.
Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
FOR j := 0 to 15
i := 32*j
dst[i+31:i] := Convert_FP32_To_IntegerTruncate(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
vcvttps2dq
__m512i _mm512_mask_cvtt_roundps_epi32 (__m512i src, __mmask16 k, __m512 a, int sae)
Synopsis
__m512i _mm512_mask_cvtt_roundps_epi32 (__m512i src, __mmask16 k, __m512 a, int sae)
#include "immintrin.h"
Instruction: vcvttps2dq zmm {k}, zmm {sae}
CPUID Flags: AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
FOR j := 0 to 15
i := 32*j
IF k[j]
dst[i+31:i] := Convert_FP32_To_IntegerTruncate(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vcvttps2dq
__m512i _mm512_maskz_cvtt_roundps_epi32 (__mmask16 k, __m512 a, int sae)
Synopsis
__m512i _mm512_maskz_cvtt_roundps_epi32 (__mmask16 k, __m512 a, int sae)
#include "immintrin.h"
Instruction: vcvttps2dq zmm {k}, zmm {sae}
CPUID Flags: AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
FOR j := 0 to 15
i := 32*j
IF k[j]
dst[i+31:i] := Convert_FP32_To_IntegerTruncate(a[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vcvttps2qq
__m512i _mm512_cvtt_roundps_epi64 (__m256 a, int sae)
Synopsis
__m512i _mm512_cvtt_roundps_epi64 (__m256 a, int sae)
#include "immintrin.h"
Instruction: vcvttps2qq
CPUID Flags: AVX512DQ
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed 64-bit integers with truncation, and store the results in dst. Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
FOR j := 0 to 7
i := j*64
l := j*32
dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
ENDFOR
dst[MAX:512] := 0
vcvttps2qq
__m512i _mm512_mask_cvtt_roundps_epi64 (__m512i src, __mmask8 k, __m256 a, int sae)
Synopsis
__m512i _mm512_mask_cvtt_roundps_epi64 (__m512i src, __mmask8 k, __m256 a, int sae)
#include "immintrin.h"
Instruction: vcvttps2qq
CPUID Flags: AVX512DQ
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed 64-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
FOR j := 0 to 7
i := j*64
l := j*32
IF k[j]
dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vcvttps2qq
__m512i _mm512_maskz_cvtt_roundps_epi64 (__mmask8 k, __m256 a, int sae)
Synopsis
__m512i _mm512_maskz_cvtt_roundps_epi64 (__mmask8 k, __m256 a, int sae)
#include "immintrin.h"
Instruction: vcvttps2qq
CPUID Flags: AVX512DQ
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed 64-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
FOR j := 0 to 7
i := j*64
l := j*32
IF k[j]
dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vcvttps2udq
__m512i _mm512_cvtt_roundps_epu32 (__m512 a, int sae)
Synopsis
__m512i _mm512_cvtt_roundps_epu32 (__m512 a, int sae)
#include "immintrin.h"
Instruction: vcvttps2udq zmm {k}, zmm {sae}
CPUID Flags: AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst.
Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
FOR j := 0 to 15
i := 32*j
dst[i+31:i] := Convert_FP32_To_UnsignedIntegerTruncate(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
vcvttps2udq
__m512i _mm512_mask_cvtt_roundps_epu32 (__m512i src, __mmask16 k, __m512 a, int sae)
Synopsis
__m512i _mm512_mask_cvtt_roundps_epu32 (__m512i src, __mmask16 k, __m512 a, int sae)
#include "immintrin.h"
Instruction: vcvttps2udq zmm {k}, zmm {sae}
CPUID Flags: AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
FOR j := 0 to 15
i := 32*j
IF k[j]
dst[i+31:i] := Convert_FP32_To_UnsignedIntegerTruncate(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vcvttps2udq
__m512i _mm512_maskz_cvtt_roundps_epu32 (__mmask16 k, __m512 a, int sae)
Synopsis
__m512i _mm512_maskz_cvtt_roundps_epu32 (__mmask16 k, __m512 a, int sae)
#include "immintrin.h"
Instruction: vcvttps2udq zmm {k}, zmm {sae}
CPUID Flags: AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
FOR j := 0 to 15
i := 32*j
IF k[j]
dst[i+31:i] := Convert_FP32_To_UnsignedIntegerTruncate(a[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vcvttps2uqq
__m512i _mm512_cvtt_roundps_epu64 (__m256 a, int sae)
Synopsis
__m512i _mm512_cvtt_roundps_epu64 (__m256 a, int sae)
#include "immintrin.h"
Instruction: vcvttps2uqq
CPUID Flags: AVX512DQ
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers with truncation, and store the results in dst. Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
FOR j := 0 to 7
i := j*64
l := j*32
dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l])
ENDFOR
dst[MAX:512] := 0
vcvttps2uqq
__m512i _mm512_mask_cvtt_roundps_epu64 (__m512i src, __mmask8 k, __m256 a, int sae)
Synopsis
__m512i _mm512_mask_cvtt_roundps_epu64 (__m512i src, __mmask8 k, __m256 a, int sae)
#include "immintrin.h"
Instruction: vcvttps2uqq
CPUID Flags: AVX512DQ
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
FOR j := 0 to 7
i := j*64
l := j*32
IF k[j]
dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vcvttps2uqq
__m512i _mm512_maskz_cvtt_roundps_epu64 (__mmask8 k, __m256 a, int sae)
Synopsis
__m512i _mm512_maskz_cvtt_roundps_epu64 (__mmask8 k, __m256 a, int sae)
#include "immintrin.h"
Instruction: vcvttps2uqq
CPUID Flags: AVX512DQ
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
FOR j := 0 to 7
i := j*64
l := j*32
IF k[j]
dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vcvttsd2si
int _mm_cvtt_roundsd_i32 (__m128d a, int rounding)
Synopsis
int _mm_cvtt_roundsd_i32 (__m128d a, int rounding)
#include "immintrin.h"
Instruction: vcvttsd2si r32, xmm {er}
CPUID Flags: AVX512F
Description
Convert the lower double-precision (64-bit) floating-point element in
a to a 32-bit integer with truncation, and store the result in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[31:0] := Convert_FP64_To_Int32_Truncate(a[63:0])
vcvttsd2si
__int64 _mm_cvtt_roundsd_i64 (__m128d a, int rounding)
Synopsis
__int64 _mm_cvtt_roundsd_i64 (__m128d a, int rounding)
#include "immintrin.h"
Instruction: vcvttsd2si r64, xmm {er}
CPUID Flags: AVX512F
Description
Convert the lower double-precision (64-bit) floating-point element in
a to a 64-bit integer with truncation, and store the result in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
vcvttsd2si
int _mm_cvtt_roundsd_si32 (__m128d a, int rounding)
Synopsis
int _mm_cvtt_roundsd_si32 (__m128d a, int rounding)
#include "immintrin.h"
Instruction: vcvttsd2si r32, xmm {er}
CPUID Flags: AVX512F
Description
Convert the lower double-precision (64-bit) floating-point element in
a to a 32-bit integer with truncation, and store the result in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[31:0] := Convert_FP64_To_Int32_Truncate(a[63:0])
vcvttsd2si
__int64 _mm_cvtt_roundsd_si64 (__m128d a, int rounding)
Synopsis
__int64 _mm_cvtt_roundsd_si64 (__m128d a, int rounding)
#include "immintrin.h"
Instruction: vcvttsd2si r64, xmm {er}
CPUID Flags: AVX512F
Description
Convert the lower double-precision (64-bit) floating-point element in
a to a 64-bit integer with truncation, and store the result in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
vcvttsd2usi
unsigned int _mm_cvtt_roundsd_u32 (__m128d a, int rounding)
Synopsis
unsigned int _mm_cvtt_roundsd_u32 (__m128d a, int rounding)
#include "immintrin.h"
Instruction: vcvttsd2usi r32, xmm {er}
CPUID Flags: AVX512F
Description
Convert the lower double-precision (64-bit) floating-point element in
a to an unsigned 32-bit integer with truncation, and store the result in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[31:0] := Convert_FP64_To_UnsignedInt32_Truncate(a[63:0])
vcvttsd2usi
unsigned __int64 _mm_cvtt_roundsd_u64 (__m128d a, int rounding)
Synopsis
unsigned __int64 _mm_cvtt_roundsd_u64 (__m128d a, int rounding)
#include "immintrin.h"
Instruction: vcvttsd2usi r64, xmm {er}
CPUID Flags: AVX512F
Description
Convert the lower double-precision (64-bit) floating-point element in
a to an unsigned 64-bit integer with truncation, and store the result in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[63:0] := Convert_FP64_To_UnsignedInt64_Truncate(a[63:0])
vcvttss2si
int _mm_cvtt_roundss_i32 (__m128 a, int rounding)
Synopsis
int _mm_cvtt_roundss_i32 (__m128 a, int rounding)
#include "immintrin.h"
Instruction: vcvttss2si r32, xmm {er}
CPUID Flags: AVX512F
Description
Convert the lower single-precision (32-bit) floating-point element in
a to a 32-bit integer with truncation, and store the result in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
vcvttss2si
__int64 _mm_cvtt_roundss_i64 (__m128 a, int rounding)
Synopsis
__int64 _mm_cvtt_roundss_i64 (__m128 a, int rounding)
#include "immintrin.h"
Instruction: vcvttss2si r64, xmm {er}
CPUID Flags: AVX512F
Description
Convert the lower single-precision (32-bit) floating-point element in
a to a 64-bit integer with truncation, and store the result in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])
vcvttss2si
int _mm_cvtt_roundss_si32 (__m128 a, int rounding)
Synopsis
int _mm_cvtt_roundss_si32 (__m128 a, int rounding)
#include "immintrin.h"
Instruction: vcvttss2si r32, xmm {er}
CPUID Flags: AVX512F
Description
Convert the lower single-precision (32-bit) floating-point element in
a to a 32-bit integer with truncation, and store the result in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
vcvttss2si
__int64 _mm_cvtt_roundss_si64 (__m128 a, int rounding)
Synopsis
__int64 _mm_cvtt_roundss_si64 (__m128 a, int rounding)
#include "immintrin.h"
Instruction: vcvttss2si r64, xmm {er}
CPUID Flags: AVX512F
Description
Convert the lower single-precision (32-bit) floating-point element in
a to a 64-bit integer with truncation, and store the result in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])
vcvttss2usi
unsigned int _mm_cvtt_roundss_u32 (__m128 a, int rounding)
Synopsis
unsigned int _mm_cvtt_roundss_u32 (__m128 a, int rounding)
#include "immintrin.h"
Instruction: vcvttss2usi r32, xmm {er}
CPUID Flags: AVX512F
Description
Convert the lower single-precision (32-bit) floating-point element in
a to an unsigned 32-bit integer with truncation, and store the result in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[31:0] := Convert_FP32_To_UnsignedInt32_Truncate(a[31:0])
vcvttss2usi
unsigned __int64 _mm_cvtt_roundss_u64 (__m128 a, int rounding)
Synopsis
unsigned __int64 _mm_cvtt_roundss_u64 (__m128 a, int rounding)
#include "immintrin.h"
Instruction: vcvttss2usi r64, xmm {er}
CPUID Flags: AVX512F
Description
Convert the lower single-precision (32-bit) floating-point element in
a to an unsigned 64-bit integer with truncation, and store the result in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[63:0] := Convert_FP32_To_UnsignedInt64_Truncate(a[31:0])
cvttss2si
int _mm_cvtt_ss2si (__m128 a)
Synopsis
int _mm_cvtt_ss2si (__m128 a)
#include "xmmintrin.h"
Instruction: cvttss2si r32, xmm
CPUID Flags: SSE
Description
Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.
Operation
dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
cvttpd2dq
__m128i _mm_cvttpd_epi32 (__m128d a)
Synopsis
__m128i _mm_cvttpd_epi32 (__m128d a)
#include "emmintrin.h"
Instruction: cvttpd2dq xmm, xmm
CPUID Flags: SSE2
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst.
Operation
FOR j := 0 to 1
i := 32*j
k := 64*j
dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[k+63:k])
ENDFOR
Performance
vcvttpd2dq
__m128i _mm_mask_cvttpd_epi32 (__m128i src, __mmask8 k, __m128d a)
Synopsis
__m128i _mm_mask_cvttpd_epi32 (__m128i src, __mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vcvttpd2dq
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := 32*j
l := 64*j
IF k[j]
dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:64] := 0
vcvttpd2dq
__m128i _mm_maskz_cvttpd_epi32 (__mmask8 k, __m128d a)
Synopsis
__m128i _mm_maskz_cvttpd_epi32 (__mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vcvttpd2dq
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := 32*j
l := 64*j
IF k[j]
dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:64] := 0
vcvttpd2dq
__m128i _mm256_cvttpd_epi32 (__m256d a)
Synopsis
__m128i _mm256_cvttpd_epi32 (__m256d a)
#include "immintrin.h"
Instruction: vcvttpd2dq xmm, ymm
CPUID Flags: AVX
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst.
Operation
FOR j := 0 to 3
i := 32*j
k := 64*j
dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[k+63:k])
ENDFOR
dst[MAX:128] := 0
Performance
vcvttpd2dq
__m128i _mm256_mask_cvttpd_epi32 (__m128i src, __mmask8 k, __m256d a)
Synopsis
__m128i _mm256_mask_cvttpd_epi32 (__m128i src, __mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vcvttpd2dq
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 32*j
l := 64*j
IF k[j]
dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vcvttpd2dq
__m128i _mm256_maskz_cvttpd_epi32 (__mmask8 k, __m256d a)
Synopsis
__m128i _mm256_maskz_cvttpd_epi32 (__mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vcvttpd2dq
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 32*j
l := 64*j
IF k[j]
dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vcvttpd2dq
__m256i _mm512_cvttpd_epi32 (__m512d a)
Synopsis
__m256i _mm512_cvttpd_epi32 (__m512d a)
#include "immintrin.h"
Instruction: vcvttpd2dq ymm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst.
Operation
FOR j := 0 to 7
i := 32*j
k := 64*j
dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[k+63:k])
ENDFOR
dst[MAX:256] := 0
vcvttpd2dq
__m256i _mm512_mask_cvttpd_epi32 (__m256i src, __mmask8 k, __m512d a)
Synopsis
__m256i _mm512_mask_cvttpd_epi32 (__m256i src, __mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vcvttpd2dq ymm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 32*j
l := 64*j
IF k[j]
dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vcvttpd2dq
__m256i _mm512_maskz_cvttpd_epi32 (__mmask8 k, __m512d a)
Synopsis
__m256i _mm512_maskz_cvttpd_epi32 (__mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vcvttpd2dq ymm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 32*j
l := 64*j
IF k[j]
dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[l+63:l])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vcvttpd2qq
__m128i _mm_cvttpd_epi64 (__m128d a)
Synopsis
__m128i _mm_cvttpd_epi64 (__m128d a)
#include "immintrin.h"
Instruction: vcvttpd2qq
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed 64-bit integers with truncation, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
vcvttpd2qq
__m128i _mm_mask_cvttpd_epi64 (__m128i src, __mmask8 k, __m128d a)
Synopsis
__m128i _mm_mask_cvttpd_epi64 (__m128i src, __mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vcvttpd2qq
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed 64-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vcvttpd2qq
__m128i _mm_maskz_cvttpd_epi64 (__mmask8 k, __m128d a)
Synopsis
__m128i _mm_maskz_cvttpd_epi64 (__mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vcvttpd2qq
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed 64-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vcvttpd2qq
__m256i _mm256_cvttpd_epi64 (__m256d a)
Synopsis
__m256i _mm256_cvttpd_epi64 (__m256d a)
#include "immintrin.h"
Instruction: vcvttpd2qq
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed 64-bit integers with truncation, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
vcvttpd2qq
__m256i _mm256_mask_cvttpd_epi64 (__m256i src, __mmask8 k, __m256d a)
Synopsis
__m256i _mm256_mask_cvttpd_epi64 (__m256i src, __mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vcvttpd2qq
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed 64-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vcvttpd2qq
__m256i _mm256_maskz_cvttpd_epi64 (__mmask8 k, __m256d a)
Synopsis
__m256i _mm256_maskz_cvttpd_epi64 (__mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vcvttpd2qq
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed 64-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vcvttpd2qq
__m512i _mm512_cvttpd_epi64 (__m512d a)
Synopsis
__m512i _mm512_cvttpd_epi64 (__m512d a)
#include "immintrin.h"
Instruction: vcvttpd2qq
CPUID Flags: AVX512DQ
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed 64-bit integers with truncation, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
vcvttpd2qq
__m512i _mm512_mask_cvttpd_epi64 (__m512i src, __mmask8 k, __m512d a)
Synopsis
__m512i _mm512_mask_cvttpd_epi64 (__m512i src, __mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vcvttpd2qq
CPUID Flags: AVX512DQ
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed 64-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vcvttpd2qq
__m512i _mm512_maskz_cvttpd_epi64 (__mmask8 k, __m512d a)
Synopsis
__m512i _mm512_maskz_cvttpd_epi64 (__mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vcvttpd2qq
CPUID Flags: AVX512DQ
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed 64-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := Convert_FP64_To_Int64_Truncate(a[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vcvttpd2udq
__m128i _mm_cvttpd_epu32 (__m128d a)
Synopsis
__m128i _mm_cvttpd_epu32 (__m128d a)
#include "immintrin.h"
Instruction: vcvttpd2udq
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst.
Operation
FOR j := 0 to 1
i := 32*j
k := 64*j
dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[k+63:k])
ENDFOR
dst[MAX:64] := 0
vcvttpd2udq
__m128i _mm_mask_cvttpd_epu32 (__m128i src, __mmask8 k, __m128d a)
Synopsis
__m128i _mm_mask_cvttpd_epu32 (__m128i src, __mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vcvttpd2udq
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := 32*j
l := 64*j
IF k[j]
dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[l+63:l])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:64] := 0
vcvttpd2udq
__m128i _mm_maskz_cvttpd_epu32 (__mmask8 k, __m128d a)
Synopsis
__m128i _mm_maskz_cvttpd_epu32 (__mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vcvttpd2udq
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := 32*j
l := 64*j
IF k[j]
dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[l+63:l])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:64] := 0
vcvttpd2udq
__m128i _mm256_cvttpd_epu32 (__m256d a)
Synopsis
__m128i _mm256_cvttpd_epu32 (__m256d a)
#include "immintrin.h"
Instruction: vcvttpd2udq
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst.
Operation
FOR j := 0 to 3
i := 32*j
k := 64*j
dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[k+63:k])
ENDFOR
dst[MAX:128] := 0
vcvttpd2udq
__m128i _mm256_mask_cvttpd_epu32 (__m128i src, __mmask8 k, __m256d a)
Synopsis
__m128i _mm256_mask_cvttpd_epu32 (__m128i src, __mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vcvttpd2udq
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 32*j
l := 64*j
IF k[j]
dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[l+63:l])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vcvttpd2udq
__m128i _mm256_maskz_cvttpd_epu32 (__mmask8 k, __m256d a)
Synopsis
__m128i _mm256_maskz_cvttpd_epu32 (__mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vcvttpd2udq
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 32*j
l := 64*j
IF k[j]
dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[l+63:l])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vcvttpd2udq
__m256i _mm512_cvttpd_epu32 (__m512d a)
Synopsis
__m256i _mm512_cvttpd_epu32 (__m512d a)
#include "immintrin.h"
Instruction: vcvttpd2udq ymm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst.
Operation
FOR j := 0 to 7
i := 32*j
k := 64*j
dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[k+63:k])
ENDFOR
dst[MAX:256] := 0
vcvttpd2udq
__m256i _mm512_mask_cvttpd_epu32 (__m256i src, __mmask8 k, __m512d a)
Synopsis
__m256i _mm512_mask_cvttpd_epu32 (__m256i src, __mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vcvttpd2udq ymm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 32*j
l := 64*j
IF k[j]
dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[l+63:l])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vcvttpd2udq
__m256i _mm512_maskz_cvttpd_epu32 (__mmask8 k, __m512d a)
Synopsis
__m256i _mm512_maskz_cvttpd_epu32 (__mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vcvttpd2udq ymm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 32*j
l := 64*j
IF k[j]
dst[i+31:i] := Convert_FP64_To_UnsignedInt32_Truncate(a[l+63:l])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vcvttpd2uqq
__m128i _mm_cvttpd_epu64 (__m128d a)
Synopsis
__m128i _mm_cvttpd_epu64 (__m128d a)
#include "immintrin.h"
Instruction: vcvttpd2uqq
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers with truncation, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
vcvttpd2uqq
__m128i _mm_mask_cvttpd_epu64 (__m128i src, __mmask8 k, __m128d a)
Synopsis
__m128i _mm_mask_cvttpd_epu64 (__m128i src, __mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vcvttpd2uqq
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vcvttpd2uqq
__m128i _mm_maskz_cvttpd_epu64 (__mmask8 k, __m128d a)
Synopsis
__m128i _mm_maskz_cvttpd_epu64 (__mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vcvttpd2uqq
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vcvttpd2uqq
__m256i _mm256_cvttpd_epu64 (__m256d a)
Synopsis
__m256i _mm256_cvttpd_epu64 (__m256d a)
#include "immintrin.h"
Instruction: vcvttpd2uqq
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers with truncation, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
vcvttpd2uqq
__m256i _mm256_mask_cvttpd_epu64 (__m256i src, __mmask8 k, __m256d a)
Synopsis
__m256i _mm256_mask_cvttpd_epu64 (__m256i src, __mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vcvttpd2uqq
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vcvttpd2uqq
__m256i _mm256_maskz_cvttpd_epu64 (__mmask8 k, __m256d a)
Synopsis
__m256i _mm256_maskz_cvttpd_epu64 (__mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vcvttpd2uqq
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vcvttpd2uqq
__m512i _mm512_cvttpd_epu64 (__m512d a)
Synopsis
__m512i _mm512_cvttpd_epu64 (__m512d a)
#include "immintrin.h"
Instruction: vcvttpd2uqq
CPUID Flags: AVX512DQ
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers with truncation, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
vcvttpd2uqq
__m512i _mm512_mask_cvttpd_epu64 (__m512i src, __mmask8 k, __m512d a)
Synopsis
__m512i _mm512_mask_cvttpd_epu64 (__m512i src, __mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vcvttpd2uqq
CPUID Flags: AVX512DQ
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vcvttpd2uqq
__m512i _mm512_maskz_cvttpd_epu64 (__mmask8 k, __m512d a)
Synopsis
__m512i _mm512_maskz_cvttpd_epu64 (__mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vcvttpd2uqq
CPUID Flags: AVX512DQ
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed unsigned 64-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := Convert_FP64_To_UnsignedInt64_Truncate(a[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
cvttpd2pi
__m64 _mm_cvttpd_pi32 (__m128d a)
Synopsis
__m64 _mm_cvttpd_pi32 (__m128d a)
#include "emmintrin.h"
Instruction: cvttpd2pi mm, xmm
CPUID Flags: SSE2
Description
Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst.
Operation
FOR j := 0 to 1
i := 32*j
k := 64*j
dst[i+31:i] := Convert_FP64_To_Int32_Truncate(a[k+63:k])
ENDFOR
Performance
cvttps2dq
__m128i _mm_cvttps_epi32 (__m128 a)
Synopsis
__m128i _mm_cvttps_epi32 (__m128 a)
#include "emmintrin.h"
Instruction: cvttps2dq xmm, xmm
CPUID Flags: SSE2
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst.
Operation
FOR j := 0 to 3
i := 32*j
dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
ENDFOR
Performance
vcvttps2dq
__m128i _mm_mask_cvttps_epi32 (__m128i src, __mmask8 k, __m128 a)
Synopsis
__m128i _mm_mask_cvttps_epi32 (__m128i src, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcvttps2dq
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 32*j
IF k[j]
dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vcvttps2dq
__m128i _mm_maskz_cvttps_epi32 (__mmask8 k, __m128 a)
Synopsis
__m128i _mm_maskz_cvttps_epi32 (__mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcvttps2dq
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 32*j
IF k[j]
dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vcvttps2dq
__m256i _mm256_cvttps_epi32 (__m256 a)
Synopsis
__m256i _mm256_cvttps_epi32 (__m256 a)
#include "immintrin.h"
Instruction: vcvttps2dq ymm, ymm
CPUID Flags: AVX
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst.
Operation
FOR j := 0 to 7
i := 32*j
dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
Performance
vcvttps2dq
__m256i _mm256_mask_cvttps_epi32 (__m256i src, __mmask8 k, __m256 a)
Synopsis
__m256i _mm256_mask_cvttps_epi32 (__m256i src, __mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vcvttps2dq
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 32*j
IF k[j]
dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vcvttps2dq
__m256i _mm256_maskz_cvttps_epi32 (__mmask8 k, __m256 a)
Synopsis
__m256i _mm256_maskz_cvttps_epi32 (__mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vcvttps2dq
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 32*j
IF k[j]
dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vcvttps2dq
__m512i _mm512_cvttps_epi32 (__m512 a)
Synopsis
__m512i _mm512_cvttps_epi32 (__m512 a)
#include "immintrin.h"
Instruction: vcvttps2dq zmm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst.
Operation
FOR j := 0 to 15
i := 32*j
dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
vcvttps2dq
__m512i _mm512_mask_cvttps_epi32 (__m512i src, __mmask16 k, __m512 a)
Synopsis
__m512i _mm512_mask_cvttps_epi32 (__m512i src, __mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vcvttps2dq zmm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := 32*j
IF k[j]
dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vcvttps2dq
__m512i _mm512_maskz_cvttps_epi32 (__mmask16 k, __m512 a)
Synopsis
__m512i _mm512_maskz_cvttps_epi32 (__mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vcvttps2dq zmm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := 32*j
IF k[j]
dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vcvttps2qq
__m128i _mm_cvttps_epi64 (__m128 a)
Synopsis
__m128i _mm_cvttps_epi64 (__m128 a)
#include "immintrin.h"
Instruction: vcvttps2qq
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed 64-bit integers with truncation, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
l := j*32
dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
ENDFOR
dst[MAX:128] := 0
vcvttps2qq
__m128i _mm_mask_cvttps_epi64 (__m128i src, __mmask8 k, __m128 a)
Synopsis
__m128i _mm_mask_cvttps_epi64 (__m128i src, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcvttps2qq
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed 64-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
l := j*32
IF k[j]
dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vcvttps2qq
__m128i _mm_maskz_cvttps_epi64 (__mmask8 k, __m128 a)
Synopsis
__m128i _mm_maskz_cvttps_epi64 (__mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcvttps2qq
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed 64-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
l := j*32
IF k[j]
dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vcvttps2qq
__m256i _mm256_cvttps_epi64 (__m128 a)
Synopsis
__m256i _mm256_cvttps_epi64 (__m128 a)
#include "immintrin.h"
Instruction: vcvttps2qq
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed 64-bit integers with truncation, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
l := j*32
dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
ENDFOR
dst[MAX:256] := 0
vcvttps2qq
__m256i _mm256_mask_cvttps_epi64 (__m256i src, __mmask8 k, __m128 a)
Synopsis
__m256i _mm256_mask_cvttps_epi64 (__m256i src, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcvttps2qq
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed 64-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
l := j*32
IF k[j]
dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vcvttps2qq
__m256i _mm256_maskz_cvttps_epi64 (__mmask8 k, __m128 a)
Synopsis
__m256i _mm256_maskz_cvttps_epi64 (__mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcvttps2qq
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed 64-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
l := j*32
IF k[j]
dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vcvttps2qq
__m512i _mm512_cvttps_epi64 (__m256 a)
Synopsis
__m512i _mm512_cvttps_epi64 (__m256 a)
#include "immintrin.h"
Instruction: vcvttps2qq
CPUID Flags: AVX512DQ
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed 64-bit integers with truncation, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
l := j*32
dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
ENDFOR
dst[MAX:512] := 0
vcvttps2qq
__m512i _mm512_mask_cvttps_epi64 (__m512i src, __mmask8 k, __m256 a)
Synopsis
__m512i _mm512_mask_cvttps_epi64 (__m512i src, __mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vcvttps2qq
CPUID Flags: AVX512DQ
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed 64-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
l := j*32
IF k[j]
dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vcvttps2qq
__m512i _mm512_maskz_cvttps_epi64 (__mmask8 k, __m256 a)
Synopsis
__m512i _mm512_maskz_cvttps_epi64 (__mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vcvttps2qq
CPUID Flags: AVX512DQ
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed 64-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
l := j*32
IF k[j]
dst[i+63:i] := Convert_FP32_To_Int64_Truncate(a[l+31:l])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vcvttps2udq
__m128i _mm_cvttps_epu32 (__m128 a)
Synopsis
__m128i _mm_cvttps_epu32 (__m128 a)
#include "immintrin.h"
Instruction: vcvttps2udq
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst.
Operation
FOR j := 0 to 3
i := 32*j
dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
vcvttps2udq
__m128i _mm_mask_cvttps_epu32 (__m128i src, __mmask8 k, __m128 a)
Synopsis
__m128i _mm_mask_cvttps_epu32 (__m128i src, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcvttps2udq
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 32*j
IF k[j]
dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vcvttps2udq
__m128i _mm_maskz_cvttps_epu32 (__mmask8 k, __m128 a)
Synopsis
__m128i _mm_maskz_cvttps_epu32 (__mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcvttps2udq
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 32*j
IF k[j]
dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vcvttps2udq
__m256i _mm256_cvttps_epu32 (__m256 a)
Synopsis
__m256i _mm256_cvttps_epu32 (__m256 a)
#include "immintrin.h"
Instruction: vcvttps2udq
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst.
Operation
FOR j := 0 to 7
i := 32*j
dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
vcvttps2udq
__m256i _mm256_mask_cvttps_epu32 (__m256i src, __mmask8 k, __m256 a)
Synopsis
__m256i _mm256_mask_cvttps_epu32 (__m256i src, __mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vcvttps2udq
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 32*j
IF k[j]
dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vcvttps2udq
__m256i _mm256_maskz_cvttps_epu32 (__mmask8 k, __m256 a)
Synopsis
__m256i _mm256_maskz_cvttps_epu32 (__mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vcvttps2udq
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 32*j
IF k[j]
dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vcvttps2udq
__m512i _mm512_cvttps_epu32 (__m512 a)
Synopsis
__m512i _mm512_cvttps_epu32 (__m512 a)
#include "immintrin.h"
Instruction: vcvttps2udq zmm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst.
Operation
FOR j := 0 to 15
i := 32*j
dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
vcvttps2udq
__m512i _mm512_mask_cvttps_epu32 (__m512i src, __mmask16 k, __m512 a)
Synopsis
__m512i _mm512_mask_cvttps_epu32 (__m512i src, __mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vcvttps2udq zmm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := 32*j
IF k[j]
dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vcvttps2udq
__m512i _mm512_maskz_cvttps_epu32 (__mmask16 k, __m512 a)
Synopsis
__m512i _mm512_maskz_cvttps_epu32 (__mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vcvttps2udq zmm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := 32*j
IF k[j]
dst[i+31:i] := Convert_FP32_To_UnsignedInt32_Truncate(a[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vcvttps2uqq
__m128i _mm_cvttps_epu64 (__m128 a)
Synopsis
__m128i _mm_cvttps_epu64 (__m128 a)
#include "immintrin.h"
Instruction: vcvttps2uqq
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers with truncation, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
l := j*32
dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l])
ENDFOR
dst[MAX:128] := 0
vcvttps2uqq
__m128i _mm_mask_cvttps_epu64 (__m128i src, __mmask8 k, __m128 a)
Synopsis
__m128i _mm_mask_cvttps_epu64 (__m128i src, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcvttps2uqq
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
l := j*32
IF k[j]
dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vcvttps2uqq
__m128i _mm_maskz_cvttps_epu64 (__mmask8 k, __m128 a)
Synopsis
__m128i _mm_maskz_cvttps_epu64 (__mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcvttps2uqq
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
l := j*32
IF k[j]
dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vcvttps2uqq
__m256i _mm256_cvttps_epu64 (__m128 a)
Synopsis
__m256i _mm256_cvttps_epu64 (__m128 a)
#include "immintrin.h"
Instruction: vcvttps2uqq
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers with truncation, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
l := j*32
dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l])
ENDFOR
dst[MAX:256] := 0
vcvttps2uqq
__m256i _mm256_mask_cvttps_epu64 (__m256i src, __mmask8 k, __m128 a)
Synopsis
__m256i _mm256_mask_cvttps_epu64 (__m256i src, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcvttps2uqq
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
l := j*32
IF k[j]
dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vcvttps2uqq
__m256i _mm256_maskz_cvttps_epu64 (__mmask8 k, __m128 a)
Synopsis
__m256i _mm256_maskz_cvttps_epu64 (__mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vcvttps2uqq
CPUID Flags: AVX512VL + AVX512DQ
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
l := j*32
IF k[j]
dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vcvttps2uqq
__m512i _mm512_cvttps_epu64 (__m256 a)
Synopsis
__m512i _mm512_cvttps_epu64 (__m256 a)
#include "immintrin.h"
Instruction: vcvttps2uqq
CPUID Flags: AVX512DQ
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers with truncation, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
l := j*32
dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l])
ENDFOR
dst[MAX:512] := 0
vcvttps2uqq
__m512i _mm512_mask_cvttps_epu64 (__m512i src, __mmask8 k, __m256 a)
Synopsis
__m512i _mm512_mask_cvttps_epu64 (__m512i src, __mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vcvttps2uqq
CPUID Flags: AVX512DQ
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
l := j*32
IF k[j]
dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vcvttps2uqq
__m512i _mm512_maskz_cvttps_epu64 (__mmask8 k, __m256 a)
Synopsis
__m512i _mm512_maskz_cvttps_epu64 (__mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vcvttps2uqq
CPUID Flags: AVX512DQ
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed unsigned 64-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
l := j*32
IF k[j]
dst[i+63:i] := Convert_FP32_To_UnsignedInt64_Truncate(a[l+31:l])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
cvttps2pi
__m64 _mm_cvttps_pi32 (__m128 a)
Synopsis
__m64 _mm_cvttps_pi32 (__m128 a)
#include "xmmintrin.h"
Instruction: cvttps2pi mm, xmm
CPUID Flags: SSE
Description
Convert packed single-precision (32-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst.
Operation
FOR j := 0 to 1
i := 32*j
dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
ENDFOR
Performance
vcvttsd2si
int _mm_cvttsd_i32 (__m128d a)
Synopsis
int _mm_cvttsd_i32 (__m128d a)
#include "immintrin.h"
Instruction: vcvttsd2si r32, xmm
CPUID Flags: AVX512F
Description
Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.
Operation
dst[31:0] := Convert_FP64_To_Int32_Truncate(a[63:0])
vcvttsd2si
__int64 _mm_cvttsd_i64 (__m128d a)
Synopsis
__int64 _mm_cvttsd_i64 (__m128d a)
#include "immintrin.h"
Instruction: vcvttsd2si r64, xmm
CPUID Flags: AVX512F
Description
Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.
Operation
dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
cvttsd2si
int _mm_cvttsd_si32 (__m128d a)
Synopsis
int _mm_cvttsd_si32 (__m128d a)
#include "emmintrin.h"
Instruction: cvttsd2si r32, xmm
CPUID Flags: SSE2
Description
Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.
Operation
dst[31:0] := Convert_FP64_To_Int32_Truncate(a[63:0])
cvttsd2si
__int64 _mm_cvttsd_si64 (__m128d a)
Synopsis
__int64 _mm_cvttsd_si64 (__m128d a)
#include "emmintrin.h"
Instruction: cvttsd2si r64, xmm
CPUID Flags: SSE2
Description
Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.
Operation
dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
Performance
cvttsd2si
__int64 _mm_cvttsd_si64x (__m128d a)
Synopsis
__int64 _mm_cvttsd_si64x (__m128d a)
#include "emmintrin.h"
Instruction: cvttsd2si r64, xmm
CPUID Flags: SSE2
Description
Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.
Operation
dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
Performance
vcvttsd2usi
unsigned int _mm_cvttsd_u32 (__m128d a)
Synopsis
unsigned int _mm_cvttsd_u32 (__m128d a)
#include "immintrin.h"
Instruction: vcvttsd2usi r32, xmm
CPUID Flags: AVX512F
Description
Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst.
Operation
dst[31:0] := Convert_FP64_To_UnsignedInt32_Truncate(a[63:0])
vcvttsd2usi
unsigned __int64 _mm_cvttsd_u64 (__m128d a)
Synopsis
unsigned __int64 _mm_cvttsd_u64 (__m128d a)
#include "immintrin.h"
Instruction: vcvttsd2usi r64, xmm
CPUID Flags: AVX512F
Description
Convert the lower double-precision (64-bit) floating-point element in a to an unsigned 64-bit integer with truncation, and store the result in dst.
Operation
dst[63:0] := Convert_FP64_To_UnsignedInt64_Truncate(a[63:0])
vcvttss2si
int _mm_cvttss_i32 (__m128 a)
Synopsis
int _mm_cvttss_i32 (__m128 a)
#include "immintrin.h"
Instruction: vcvttss2si r32, xmm
CPUID Flags: AVX512F
Description
Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.
Operation
dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
vcvttss2si
__int64 _mm_cvttss_i64 (__m128 a)
Synopsis
__int64 _mm_cvttss_i64 (__m128 a)
#include "immintrin.h"
Instruction: vcvttss2si r64, xmm
CPUID Flags: AVX512F
Description
Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.
Operation
dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])
cvttss2si
int _mm_cvttss_si32 (__m128 a)
Synopsis
int _mm_cvttss_si32 (__m128 a)
#include "xmmintrin.h"
Instruction: cvttss2si r32, xmm
CPUID Flags: SSE
Description
Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.
Operation
dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
cvttss2si
__int64 _mm_cvttss_si64 (__m128 a)
Synopsis
__int64 _mm_cvttss_si64 (__m128 a)
#include "xmmintrin.h"
Instruction: cvttss2si r64, xmm
CPUID Flags: SSE
Description
Convert the lower single-precision (32-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst.
Operation
dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])
Performance
vcvttss2usi
unsigned int _mm_cvttss_u32 (__m128 a)
Synopsis
unsigned int _mm_cvttss_u32 (__m128 a)
#include "immintrin.h"
Instruction: vcvttss2usi r32, xmm
CPUID Flags: AVX512F
Description
Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 32-bit integer with truncation, and store the result in dst.
Operation
dst[31:0] := Convert_FP32_To_UnsignedInt32_Truncate(a[31:0])
vcvttss2usi
unsigned __int64 _mm_cvttss_u64 (__m128 a)
Synopsis
unsigned __int64 _mm_cvttss_u64 (__m128 a)
#include "immintrin.h"
Instruction: vcvttss2usi r64, xmm
CPUID Flags: AVX512F
Description
Convert the lower single-precision (32-bit) floating-point element in a to an unsigned 64-bit integer with truncation, and store the result in dst.
Operation
dst[63:0] := Convert_FP32_To_UnsignedInt64_Truncate(a[31:0])
vcvtusi2sd
__m128d _mm_cvtu32_sd (__m128d a, unsigned int b)
Synopsis
__m128d _mm_cvtu32_sd (__m128d a, unsigned int b)
#include "immintrin.h"
Instruction: vcvtusi2sd xmm, xmm, r32
CPUID Flags: AVX512F
Description
Convert the unsigned 32-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
Operation
dst[63:0] := Convert_UnsignedInt32_To_FP64(b[31:0])
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vcvtusi2ss
__m128 _mm_cvtu32_ss (__m128 a, unsigned int b)
Synopsis
__m128 _mm_cvtu32_ss (__m128 a, unsigned int b)
#include "immintrin.h"
Instruction: vcvtusi2ss xmm, xmm, r32 {er}
CPUID Flags: AVX512F
Description
Convert the unsigned 32-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
dst[31:0] := Convert_UnsignedInt32_To_FP32(b[31:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vcvtusi2sd
__m128d _mm_cvtu64_sd (__m128d a, unsigned __int64 b)
Synopsis
__m128d _mm_cvtu64_sd (__m128d a, unsigned __int64 b)
#include "immintrin.h"
Instruction: vcvtusi2sd xmm, xmm, r64
CPUID Flags: AVX512F
Description
Convert the unsigned 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
Operation
dst[63:0] := Convert_UnsignedInt64_To_FP64(b[63:0])
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vcvtusi2ss
__m128 _mm_cvtu64_ss (__m128 a, unsigned __int64 b)
Synopsis
__m128 _mm_cvtu64_ss (__m128 a, unsigned __int64 b)
#include "immintrin.h"
Instruction: vcvtusi2ss xmm, xmm, r64
CPUID Flags: AVX512F
Description
Convert the unsigned 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
dst[31:0] := Convert_UnsignedInt64_To_FP32(b[63:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vpmovuswb
__m128i _mm_cvtusepi16_epi8 (__m128i a)
Synopsis
__m128i _mm_cvtusepi16_epi8 (__m128i a)
#include "immintrin.h"
Instruction: vpmovuswb
CPUID Flags: AVX512VL + AVX512BW
Description
Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst.
Operation
FOR j := 0 to 7
i := 16*j
l := 8*j
dst[l+7:l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i])
ENDFOR
dst[MAX:64] := 0
vpmovuswb
__m128i _mm_mask_cvtusepi16_epi8 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_cvtusepi16_epi8 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovuswb
CPUID Flags: AVX512VL + AVX512BW
Description
Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 16*j
l := 8*j
IF k[j]
dst[l+7:l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i])
ELSE
dst[l+7:l] := src[l+7:l]
FI
ENDFOR
dst[MAX:64] := 0
vpmovuswb
__m128i _mm_maskz_cvtusepi16_epi8 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_cvtusepi16_epi8 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovuswb
CPUID Flags: AVX512VL + AVX512BW
Description
Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 16*j
l := 8*j
IF k[j]
dst[l+7:l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i])
ELSE
dst[l+7:l] := 0
FI
ENDFOR
dst[MAX:64] := 0
vpmovuswb
__m128i _mm256_cvtusepi16_epi8 (__m256i a)
Synopsis
__m128i _mm256_cvtusepi16_epi8 (__m256i a)
#include "immintrin.h"
Instruction: vpmovuswb
CPUID Flags: AVX512VL + AVX512BW
Description
Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst.
Operation
FOR j := 0 to 15
i := 16*j
l := 8*j
dst[l+7:l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i])
ENDFOR
dst[MAX:128] := 0
vpmovuswb
__m128i _mm256_mask_cvtusepi16_epi8 (__m128i src, __mmask16 k, __m256i a)
Synopsis
__m128i _mm256_mask_cvtusepi16_epi8 (__m128i src, __mmask16 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovuswb
CPUID Flags: AVX512VL + AVX512BW
Description
Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := 16*j
l := 8*j
IF k[j]
dst[l+7:l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i])
ELSE
dst[l+7:l] := src[l+7:l]
FI
ENDFOR
dst[MAX:128] := 0
vpmovuswb
__m128i _mm256_maskz_cvtusepi16_epi8 (__mmask16 k, __m256i a)
Synopsis
__m128i _mm256_maskz_cvtusepi16_epi8 (__mmask16 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovuswb
CPUID Flags: AVX512VL + AVX512BW
Description
Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := 16*j
l := 8*j
IF k[j]
dst[l+7:l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i])
ELSE
dst[l+7:l] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpmovuswb
__m256i _mm512_cvtusepi16_epi8 (__m512i a)
Synopsis
__m256i _mm512_cvtusepi16_epi8 (__m512i a)
#include "immintrin.h"
Instruction: vpmovuswb
CPUID Flags: AVX512BW
Description
Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst.
Operation
FOR j := 0 to 31
i := 16*j
l := 8*j
dst[l+7:l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i])
ENDFOR
dst[MAX:256] := 0
vpmovuswb
__m256i _mm512_mask_cvtusepi16_epi8 (__m256i src, __mmask32 k, __m512i a)
Synopsis
__m256i _mm512_mask_cvtusepi16_epi8 (__m256i src, __mmask32 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovuswb
CPUID Flags: AVX512BW
Description
Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := 16*j
l := 8*j
IF k[j]
dst[l+7:l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i])
ELSE
dst[l+7:l] := src[l+7:l]
FI
ENDFOR
dst[MAX:256] := 0
vpmovuswb
__m256i _mm512_maskz_cvtusepi16_epi8 (__mmask32 k, __m512i a)
Synopsis
__m256i _mm512_maskz_cvtusepi16_epi8 (__mmask32 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovuswb
CPUID Flags: AVX512BW
Description
Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := 16*j
l := 8*j
IF k[j]
dst[l+7:l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i])
ELSE
dst[l+7:l] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpmovuswb
void _mm_mask_cvtusepi16_storeu_epi8 (void* base_addr, __mmask8 k, __m128i a)
Synopsis
void _mm_mask_cvtusepi16_storeu_epi8 (void* base_addr, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovuswb
CPUID Flags: AVX512VL + AVX512BW
Description
Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 7
i := 16*j
l := 8*j
IF k[j]
MEM[base_addr+l+7:base_addr+l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i])
FI
ENDFOR
dst[MAX:64] := 0
vpmovuswb
void _mm256_mask_cvtusepi16_storeu_epi8 (void* base_addr, __mmask16 k, __m256i a)
Synopsis
void _mm256_mask_cvtusepi16_storeu_epi8 (void* base_addr, __mmask16 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovuswb
CPUID Flags: AVX512VL + AVX512BW
Description
Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 15
i := 16*j
l := 8*j
IF k[j]
MEM[base_addr+l+7:base_addr+l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i])
FI
ENDFOR
dst[MAX:128] := 0
vpmovuswb
void _mm512_mask_cvtusepi16_storeu_epi8 (void* base_addr, __mmask32 k, __m512i a)
Synopsis
void _mm512_mask_cvtusepi16_storeu_epi8 (void* base_addr, __mmask32 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovuswb
CPUID Flags: AVX512BW
Description
Convert packed unsigned 16-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 31
i := 16*j
l := 8*j
IF k[j]
MEM[base_addr+l+7:base_addr+l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i])
FI
ENDFOR
dst[MAX:256] := 0
vpmovusdw
__m128i _mm_cvtusepi32_epi16 (__m128i a)
Synopsis
__m128i _mm_cvtusepi32_epi16 (__m128i a)
#include "immintrin.h"
Instruction: vpmovusdw
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst.
Operation
FOR j := 0 to 3
i := 32*j
k := 16*j
dst[k+15:k] := Saturate_UnsignedInt32_To_Int16(a[i+31:i])
ENDFOR
dst[MAX:64] := 0
vpmovusdw
__m128i _mm_mask_cvtusepi32_epi16 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_cvtusepi32_epi16 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovusdw
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 32*j
l := 16*j
IF k[j]
dst[l+15:l] := Saturate_UnsignedInt32_To_Int16(a[i+31:i])
ELSE
dst[l+15:l] := src[l+15:l]
FI
ENDFOR
dst[MAX:64] := 0
vpmovusdw
__m128i _mm_maskz_cvtusepi32_epi16 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_cvtusepi32_epi16 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovusdw
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 32*j
l := 16*j
IF k[j]
dst[l+15:l] := Saturate_UnsignedInt32_To_Int16(a[i+31:i])
ELSE
dst[l+15:l] := 0
FI
ENDFOR
dst[MAX:64] := 0
vpmovusdw
__m128i _mm256_cvtusepi32_epi16 (__m256i a)
Synopsis
__m128i _mm256_cvtusepi32_epi16 (__m256i a)
#include "immintrin.h"
Instruction: vpmovusdw
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst.
Operation
FOR j := 0 to 7
i := 32*j
k := 16*j
dst[k+15:k] := Saturate_UnsignedInt32_To_Int16(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
vpmovusdw
__m128i _mm256_mask_cvtusepi32_epi16 (__m128i src, __mmask8 k, __m256i a)
Synopsis
__m128i _mm256_mask_cvtusepi32_epi16 (__m128i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovusdw
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 32*j
l := 16*j
IF k[j]
dst[l+15:l] := Saturate_UnsignedInt32_To_Int16(a[i+31:i])
ELSE
dst[l+15:l] := src[l+15:l]
FI
ENDFOR
dst[MAX:128] := 0
vpmovusdw
__m128i _mm256_maskz_cvtusepi32_epi16 (__mmask8 k, __m256i a)
Synopsis
__m128i _mm256_maskz_cvtusepi32_epi16 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovusdw
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 32*j
l := 16*j
IF k[j]
dst[l+15:l] := Saturate_UnsignedInt32_To_Int16(a[i+31:i])
ELSE
dst[l+15:l] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpmovusdw
__m256i _mm512_cvtusepi32_epi16 (__m512i a)
Synopsis
__m256i _mm512_cvtusepi32_epi16 (__m512i a)
#include "immintrin.h"
Instruction: vpmovusdw ymm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst.
Operation
FOR j := 0 to 15
i := 32*j
k := 16*j
dst[k+15:k] := Saturate_UnsignedInt32_To_Int16(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
vpmovusdw
__m256i _mm512_mask_cvtusepi32_epi16 (__m256i src, __mmask16 k, __m512i a)
Synopsis
__m256i _mm512_mask_cvtusepi32_epi16 (__m256i src, __mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovusdw ymm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := 32*j
l := 16*j
IF k[j]
dst[l+15:l] := Saturate_UnsignedInt32_To_Int16(a[i+31:i])
ELSE
dst[l+15:l] := src[l+15:l]
FI
ENDFOR
dst[MAX:256] := 0
vpmovusdw
__m256i _mm512_maskz_cvtusepi32_epi16 (__mmask16 k, __m512i a)
Synopsis
__m256i _mm512_maskz_cvtusepi32_epi16 (__mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovusdw ymm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := 32*j
l := 16*j
IF k[j]
dst[l+15:l] := Saturate_UnsignedInt32_To_Int16(a[i+31:i])
ELSE
dst[l+15:l] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpmovusdb
__m128i _mm_cvtusepi32_epi8 (__m128i a)
Synopsis
__m128i _mm_cvtusepi32_epi8 (__m128i a)
#include "immintrin.h"
Instruction: vpmovusdb
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst.
Operation
FOR j := 0 to 3
i := 32*j
k := 8*j
dst[k+7:k] := Saturate_UnsignedInt32_To_Int8(a[i+31:i])
ENDFOR
dst[MAX:32] := 0
vpmovusdb
__m128i _mm_mask_cvtusepi32_epi8 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_cvtusepi32_epi8 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovusdb
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 32*j
l := 8*j
IF k[j]
dst[l+7:l] := Saturate_UnsignedInt32_To_Int8(a[i+31:i])
ELSE
dst[l+7:l] := src[l+7:l]
FI
ENDFOR
dst[MAX:32] := 0
vpmovusdb
__m128i _mm_maskz_cvtusepi32_epi8 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_cvtusepi32_epi8 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovusdb
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 32*j
l := 8*j
IF k[j]
dst[l+7:l] := Saturate_UnsignedInt32_To_Int8(a[i+31:i])
ELSE
dst[l+7:l] := 0
FI
ENDFOR
dst[MAX:32] := 0
vpmovusdb
__m128i _mm256_cvtusepi32_epi8 (__m256i a)
Synopsis
__m128i _mm256_cvtusepi32_epi8 (__m256i a)
#include "immintrin.h"
Instruction: vpmovusdb
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst.
Operation
FOR j := 0 to 7
i := 32*j
k := 8*j
dst[k+7:k] := Saturate_UnsignedInt32_To_Int8(a[i+31:i])
ENDFOR
dst[MAX:64] := 0
vpmovusdb
__m128i _mm256_mask_cvtusepi32_epi8 (__m128i src, __mmask8 k, __m256i a)
Synopsis
__m128i _mm256_mask_cvtusepi32_epi8 (__m128i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovusdb
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 32*j
l := 8*j
IF k[j]
dst[l+7:l] := Saturate_UnsignedInt32_To_Int8(a[i+31:i])
ELSE
dst[l+7:l] := src[l+7:l]
FI
ENDFOR
dst[MAX:64] := 0
vpmovusdb
__m128i _mm256_maskz_cvtusepi32_epi8 (__mmask8 k, __m256i a)
Synopsis
__m128i _mm256_maskz_cvtusepi32_epi8 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovusdb
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 32*j
l := 8*j
IF k[j]
dst[l+7:l] := Saturate_UnsignedInt32_To_Int8(a[i+31:i])
ELSE
dst[l+7:l] := 0
FI
ENDFOR
dst[MAX:64] := 0
vpmovusdb
__m128i _mm512_cvtusepi32_epi8 (__m512i a)
Synopsis
__m128i _mm512_cvtusepi32_epi8 (__m512i a)
#include "immintrin.h"
Instruction: vpmovusdb xmm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst.
Operation
FOR j := 0 to 15
i := 32*j
k := 8*j
dst[k+7:k] := Saturate_UnsignedInt32_To_Int8(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
vpmovusdb
__m128i _mm512_mask_cvtusepi32_epi8 (__m128i src, __mmask16 k, __m512i a)
Synopsis
__m128i _mm512_mask_cvtusepi32_epi8 (__m128i src, __mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovusdb xmm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := 32*j
l := 8*j
IF k[j]
dst[l+7:l] := Saturate_UnsignedInt32_To_Int8(a[i+31:i])
ELSE
dst[l+7:l] := src[l+7:l]
FI
ENDFOR
dst[MAX:128] := 0
vpmovusdb
__m128i _mm512_maskz_cvtusepi32_epi8 (__mmask16 k, __m512i a)
Synopsis
__m128i _mm512_maskz_cvtusepi32_epi8 (__mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovusdb xmm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := 32*j
l := 8*j
IF k[j]
dst[l+7:l] := Saturate_UnsignedInt32_To_Int8(a[i+31:i])
ELSE
dst[l+7:l] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpmovusdw
void _mm_mask_cvtusepi32_storeu_epi16 (void* base_addr, __mmask8 k, __m128i a)
Synopsis
void _mm_mask_cvtusepi32_storeu_epi16 (void* base_addr, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovusdw
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 3
i := 32*j
l := 16*j
IF k[j]
MEM[base_addr+l+15:base_addr+l] := Saturate_UnsignedInt32_To_Int16(a[i+31:i])
FI
ENDFOR
dst[MAX:64] := 0
vpmovusdw
void _mm256_mask_cvtusepi32_storeu_epi16 (void* base_addr, __mmask8 k, __m256i a)
Synopsis
void _mm256_mask_cvtusepi32_storeu_epi16 (void* base_addr, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovusdw
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 7
i := 32*j
l := 16*j
IF k[j]
MEM[base_addr+l+15:base_addr+l] := Saturate_UnsignedInt32_To_Int16(a[i+31:i])
FI
ENDFOR
dst[MAX:128] := 0
vpmovusdw
void _mm512_mask_cvtusepi32_storeu_epi16 (void* base_addr, __mmask16 k, __m512i a)
Synopsis
void _mm512_mask_cvtusepi32_storeu_epi16 (void* base_addr, __mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovusdw m256 {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed unsigned 32-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 15
i := 32*j
l := 16*j
IF k[j]
MEM[base_addr+l+15:base_addr+l] := Saturate_UnsignedInt32_To_Int16(a[i+31:i])
FI
ENDFOR
vpmovusdb
void _mm_mask_cvtusepi32_storeu_epi8 (void* base_addr, __mmask8 k, __m128i a)
Synopsis
void _mm_mask_cvtusepi32_storeu_epi8 (void* base_addr, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovusdb
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 3
i := 32*j
l := 8*j
IF k[j]
MEM[base_addr+l+7:base_addr+l] := Saturate_UnsignedInt32_To_Int8(a[i+31:i])
FI
ENDFOR
dst[MAX:32] := 0
vpmovusdb
void _mm256_mask_cvtusepi32_storeu_epi8 (void* base_addr, __mmask8 k, __m256i a)
Synopsis
void _mm256_mask_cvtusepi32_storeu_epi8 (void* base_addr, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovusdb
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 7
i := 32*j
l := 8*j
IF k[j]
MEM[base_addr+l+7:base_addr+l] := Saturate_UnsignedInt32_To_Int8(a[i+31:i])
FI
ENDFOR
dst[MAX:64] := 0
vpmovusdb
void _mm512_mask_cvtusepi32_storeu_epi8 (void* base_addr, __mmask16 k, __m512i a)
Synopsis
void _mm512_mask_cvtusepi32_storeu_epi8 (void* base_addr, __mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovusdb m128 {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed unsigned 32-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 15
i := 32*j
l := 8*j
IF k[j]
MEM[base_addr+l+7:base_addr+l] := Saturate_UnsignedInt32_To_Int8(a[i+31:i])
FI
ENDFOR
vpmovusqw
__m128i _mm_cvtusepi64_epi16 (__m128i a)
Synopsis
__m128i _mm_cvtusepi64_epi16 (__m128i a)
#include "immintrin.h"
Instruction: vpmovusqw
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst.
Operation
FOR j := 0 to 1
i := 64*j
k := 16*j
dst[k+15:k] := Saturate_UnsignedInt64_To_Int16(a[i+63:i])
ENDFOR
dst[MAX:32] := 0
vpmovusqw
__m128i _mm_mask_cvtusepi64_epi16 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_cvtusepi64_epi16 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovusqw
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := 64*j
l := 16*j
IF k[j]
dst[l+15:l] := Saturate_UnsignedInt64_To_Int16(a[i+63:i])
ELSE
dst[l+15:l] := src[l+15:l]
FI
ENDFOR
dst[MAX:32] := 0
vpmovusqw
__m128i _mm_maskz_cvtusepi64_epi16 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_cvtusepi64_epi16 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovusqw
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := 64*j
l := 16*j
IF k[j]
dst[l+15:l] := Saturate_UnsignedInt64_To_Int16(a[i+63:i])
ELSE
dst[l+15:l] := 0
FI
ENDFOR
dst[MAX:32] := 0
vpmovusqw
__m128i _mm256_cvtusepi64_epi16 (__m256i a)
Synopsis
__m128i _mm256_cvtusepi64_epi16 (__m256i a)
#include "immintrin.h"
Instruction: vpmovusqw
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst.
Operation
FOR j := 0 to 3
i := 64*j
k := 16*j
dst[k+15:k] := Saturate_UnsignedInt64_To_Int16(a[i+63:i])
ENDFOR
dst[MAX:64] := 0
vpmovusqw
__m128i _mm256_mask_cvtusepi64_epi16 (__m128i src, __mmask8 k, __m256i a)
Synopsis
__m128i _mm256_mask_cvtusepi64_epi16 (__m128i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovusqw
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 64*j
l := 16*j
IF k[j]
dst[l+15:l] := Saturate_UnsignedInt64_To_Int16(a[i+63:i])
ELSE
dst[l+15:l] := src[l+15:l]
FI
ENDFOR
dst[MAX:64] := 0
vpmovusqw
__m128i _mm256_maskz_cvtusepi64_epi16 (__mmask8 k, __m256i a)
Synopsis
__m128i _mm256_maskz_cvtusepi64_epi16 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovusqw
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 64*j
l := 16*j
IF k[j]
dst[l+15:l] := Saturate_UnsignedInt64_To_Int16(a[i+63:i])
ELSE
dst[l+15:l] := 0
FI
ENDFOR
dst[MAX:64] := 0
vpmovusqw
__m128i _mm512_cvtusepi64_epi16 (__m512i a)
Synopsis
__m128i _mm512_cvtusepi64_epi16 (__m512i a)
#include "immintrin.h"
Instruction: vpmovusqw xmm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst.
Operation
FOR j := 0 to 7
i := 64*j
k := 16*j
dst[k+15:k] := Saturate_UnsignedInt64_To_Int16(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
vpmovusqw
__m128i _mm512_mask_cvtusepi64_epi16 (__m128i src, __mmask8 k, __m512i a)
Synopsis
__m128i _mm512_mask_cvtusepi64_epi16 (__m128i src, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovusqw xmm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 64*j
l := 16*j
IF k[j]
dst[l+15:l] := Saturate_UnsignedInt64_To_Int16(a[i+63:i])
ELSE
dst[l+15:l] := src[l+15:l]
FI
ENDFOR
dst[MAX:128] := 0
vpmovusqw
__m128i _mm512_maskz_cvtusepi64_epi16 (__mmask8 k, __m512i a)
Synopsis
__m128i _mm512_maskz_cvtusepi64_epi16 (__mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovusqw xmm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 64*j
l := 16*j
IF k[j]
dst[l+15:l] := Saturate_UnsignedInt64_To_Int16(a[i+63:i])
ELSE
dst[l+15:l] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpmovusqd
__m128i _mm_cvtusepi64_epi32 (__m128i a)
Synopsis
__m128i _mm_cvtusepi64_epi32 (__m128i a)
#include "immintrin.h"
Instruction: vpmovusqd
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst.
Operation
FOR j := 0 to 1
i := 64*j
k := 32*j
dst[k+31:k] := Saturate_UnsignedInt64_To_Int32(a[i+63:i])
ENDFOR
dst[MAX:64] := 0
vpmovusqd
__m128i _mm_mask_cvtusepi64_epi32 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_cvtusepi64_epi32 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovusqd
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := 64*j
l := 32*j
IF k[j]
dst[l+31:l] := Saturate_UnsignedInt64_To_Int32(a[i+63:i])
ELSE
dst[l+31:l] := src[l+31:l]
FI
ENDFOR
dst[MAX:64] := 0
vpmovusqd
__m128i _mm_maskz_cvtusepi64_epi32 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_cvtusepi64_epi32 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovusqd
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := 64*j
l := 32*j
IF k[j]
dst[l+31:l] := Saturate_UnsignedInt64_To_Int32(a[i+63:i])
ELSE
dst[l+31:l] := 0
FI
ENDFOR
dst[MAX:64] := 0
vpmovusqd
__m128i _mm256_cvtusepi64_epi32 (__m256i a)
Synopsis
__m128i _mm256_cvtusepi64_epi32 (__m256i a)
#include "immintrin.h"
Instruction: vpmovusqd
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst.
Operation
FOR j := 0 to 3
i := 64*j
k := 32*j
dst[k+31:k] := Saturate_UnsignedInt64_To_Int32(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
vpmovusqd
__m128i _mm256_mask_cvtusepi64_epi32 (__m128i src, __mmask8 k, __m256i a)
Synopsis
__m128i _mm256_mask_cvtusepi64_epi32 (__m128i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovusqd
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 64*j
l := 32*j
IF k[j]
dst[l+31:l] := Saturate_UnsignedInt64_To_Int32(a[i+63:i])
ELSE
dst[l+31:l] := src[l+31:l]
FI
ENDFOR
dst[MAX:128] := 0
vpmovusqd
__m128i _mm256_maskz_cvtusepi64_epi32 (__mmask8 k, __m256i a)
Synopsis
__m128i _mm256_maskz_cvtusepi64_epi32 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovusqd
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 64*j
l := 32*j
IF k[j]
dst[l+31:l] := Saturate_UnsignedInt64_To_Int32(a[i+63:i])
ELSE
dst[l+31:l] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpmovusqd
__m256i _mm512_cvtusepi64_epi32 (__m512i a)
Synopsis
__m256i _mm512_cvtusepi64_epi32 (__m512i a)
#include "immintrin.h"
Instruction: vpmovusqd ymm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst.
Operation
FOR j := 0 to 7
i := 64*j
k := 32*j
dst[k+31:k] := Saturate_UnsignedInt64_To_Int32(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
vpmovusqd
__m256i _mm512_mask_cvtusepi64_epi32 (__m256i src, __mmask8 k, __m512i a)
Synopsis
__m256i _mm512_mask_cvtusepi64_epi32 (__m256i src, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovusqd ymm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 64*j
l := 32*j
IF k[j]
dst[l+31:l] := Saturate_UnsignedInt64_To_Int32(a[i+63:i])
ELSE
dst[l+31:l] := src[l+31:l]
FI
ENDFOR
dst[MAX:256] := 0
vpmovusqd
__m256i _mm512_maskz_cvtusepi64_epi32 (__mmask8 k, __m512i a)
Synopsis
__m256i _mm512_maskz_cvtusepi64_epi32 (__mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovusqd ymm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 64*j
l := 32*j
IF k[j]
dst[l+31:l] := Saturate_UnsignedInt64_To_Int32(a[i+63:i])
ELSE
dst[l+31:l] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpmovusqb
__m128i _mm_cvtusepi64_epi8 (__m128i a)
Synopsis
__m128i _mm_cvtusepi64_epi8 (__m128i a)
#include "immintrin.h"
Instruction: vpmovusqb
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst.
Operation
FOR j := 0 to 1
i := 64*j
k := 8*j
dst[k+7:k] := Saturate_UnsignedInt64_To_Int8(a[i+63:i])
ENDFOR
dst[MAX:16] := 0
vpmovusqb
__m128i _mm_mask_cvtusepi64_epi8 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_cvtusepi64_epi8 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovusqb
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := 64*j
l := 8*j
IF k[j]
dst[l+7:l] := Saturate_UnsignedInt64_To_Int8(a[i+63:i])
ELSE
dst[l+7:l] := src[l+7:l]
FI
ENDFOR
dst[MAX:16] := 0
vpmovusqb
__m128i _mm_maskz_cvtusepi64_epi8 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_cvtusepi64_epi8 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovusqb
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := 64*j
l := 8*j
IF k[j]
dst[l+7:l] := Saturate_UnsignedInt64_To_Int8(a[i+63:i])
ELSE
dst[l+7:l] := 0
FI
ENDFOR
dst[MAX:16] := 0
vpmovusqb
__m128i _mm256_cvtusepi64_epi8 (__m256i a)
Synopsis
__m128i _mm256_cvtusepi64_epi8 (__m256i a)
#include "immintrin.h"
Instruction: vpmovusqb
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst.
Operation
FOR j := 0 to 3
i := 64*j
k := 8*j
dst[k+7:k] := Saturate_UnsignedInt64_To_Int8(a[i+63:i])
ENDFOR
dst[MAX:32] := 0
vpmovusqb
__m128i _mm256_mask_cvtusepi64_epi8 (__m128i src, __mmask8 k, __m256i a)
Synopsis
__m128i _mm256_mask_cvtusepi64_epi8 (__m128i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovusqb
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 64*j
l := 8*j
IF k[j]
dst[l+7:l] := Saturate_UnsignedInt64_To_Int8(a[i+63:i])
ELSE
dst[l+7:l] := src[l+7:l]
FI
ENDFOR
dst[MAX:32] := 0
vpmovusqb
__m128i _mm256_maskz_cvtusepi64_epi8 (__mmask8 k, __m256i a)
Synopsis
__m128i _mm256_maskz_cvtusepi64_epi8 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovusqb
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 64*j
l := 8*j
IF k[j]
dst[l+7:l] := Saturate_UnsignedInt64_To_Int8(a[i+63:i])
ELSE
dst[l+7:l] := 0
FI
ENDFOR
dst[MAX:32] := 0
vpmovusqb
__m128i _mm512_cvtusepi64_epi8 (__m512i a)
Synopsis
__m128i _mm512_cvtusepi64_epi8 (__m512i a)
#include "immintrin.h"
Instruction: vpmovusqb xmm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst.
Operation
FOR j := 0 to 7
i := 64*j
k := 8*j
dst[k+7:k] := Saturate_UnsignedInt64_To_Int8(a[i+63:i])
ENDFOR
dst[MAX:64] := 0
vpmovusqb
__m128i _mm512_mask_cvtusepi64_epi8 (__m128i src, __mmask8 k, __m512i a)
Synopsis
__m128i _mm512_mask_cvtusepi64_epi8 (__m128i src, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovusqb xmm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 64*j
l := 8*j
IF k[j]
dst[l+7:l] := Saturate_UnsignedInt64_To_Int8(a[i+63:i])
ELSE
dst[l+7:l] := src[l+7:l]
FI
ENDFOR
dst[MAX:64] := 0
vpmovusqb
__m128i _mm512_maskz_cvtusepi64_epi8 (__mmask8 k, __m512i a)
Synopsis
__m128i _mm512_maskz_cvtusepi64_epi8 (__mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovusqb xmm {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 64*j
l := 8*j
IF k[j]
dst[l+7:l] := Saturate_UnsignedInt64_To_Int8(a[i+63:i])
ELSE
dst[l+7:l] := 0
FI
ENDFOR
dst[MAX:64] := 0
vpmovusqw
void _mm_mask_cvtusepi64_storeu_epi16 (void* base_addr, __mmask8 k, __m128i a)
Synopsis
void _mm_mask_cvtusepi64_storeu_epi16 (void* base_addr, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovusqw
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 1
i := 64*j
l := 16*j
IF k[j]
MEM[base_addr+l+15:base_addr+l] := Saturate_UnsignedInt64_To_Int16(a[i+63:i])
FI
ENDFOR
dst[MAX:32] := 0
vpmovusqw
void _mm256_mask_cvtusepi64_storeu_epi16 (void* base_addr, __mmask8 k, __m256i a)
Synopsis
void _mm256_mask_cvtusepi64_storeu_epi16 (void* base_addr, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovusqw
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 3
i := 64*j
l := 16*j
IF k[j]
MEM[base_addr+l+15:base_addr+l] := Saturate_UnsignedInt64_To_Int16(a[i+63:i])
FI
ENDFOR
dst[MAX:64] := 0
vpmovusqw
void _mm512_mask_cvtusepi64_storeu_epi16 (void* base_addr, __mmask8 k, __m512i a)
Synopsis
void _mm512_mask_cvtusepi64_storeu_epi16 (void* base_addr, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovusqw m128 {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed unsigned 64-bit integers in a to packed unsigned 16-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 7
i := 64*j
l := 16*j
IF k[j]
MEM[base_addr+l+15:base_addr+l] := Saturate_UnsignedInt64_To_Int16(a[i+63:i])
FI
ENDFOR
vpmovusqd
void _mm_mask_cvtusepi64_storeu_epi32 (void* base_addr, __mmask8 k, __m128i a)
Synopsis
void _mm_mask_cvtusepi64_storeu_epi32 (void* base_addr, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovusqd
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 1
i := 64*j
l := 32*j
IF k[j]
MEM[base_addr+l+31:base_addr+l] := Saturate_UnsignedInt64_To_Int32(a[i+63:i])
FI
ENDFOR
dst[MAX:64] := 0
vpmovusqd
void _mm256_mask_cvtusepi64_storeu_epi32 (void* base_addr, __mmask8 k, __m256i a)
Synopsis
void _mm256_mask_cvtusepi64_storeu_epi32 (void* base_addr, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovusqd
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 3
i := 64*j
l := 32*j
IF k[j]
MEM[base_addr+l+31:base_addr+l] := Saturate_UnsignedInt64_To_Int32(a[i+63:i])
FI
ENDFOR
dst[MAX:128] := 0
vpmovusqd
void _mm512_mask_cvtusepi64_storeu_epi32 (void* base_addr, __mmask8 k, __m512i a)
Synopsis
void _mm512_mask_cvtusepi64_storeu_epi32 (void* base_addr, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovusqd m256 {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed unsigned 64-bit integers in a to packed unsigned 32-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 7
i := 64*j
l := 32*j
IF k[j]
MEM[base_addr+l+31:base_addr+l] := Saturate_UnsignedInt64_To_Int32(a[i+63:i])
FI
ENDFOR
vpmovusqb
void _mm_mask_cvtusepi64_storeu_epi8 (void* base_addr, __mmask8 k, __m128i a)
Synopsis
void _mm_mask_cvtusepi64_storeu_epi8 (void* base_addr, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpmovusqb
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 1
i := 64*j
l := 8*j
IF k[j]
MEM[base_addr+l+7:base_addr+l] := Saturate_UnsignedInt64_To_Int8(a[i+63:i])
FI
ENDFOR
dst[MAX:16] := 0
vpmovusqb
void _mm256_mask_cvtusepi64_storeu_epi8 (void* base_addr, __mmask8 k, __m256i a)
Synopsis
void _mm256_mask_cvtusepi64_storeu_epi8 (void* base_addr, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpmovusqb
CPUID Flags: AVX512VL + AVX512F
Description
Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 3
i := 64*j
l := 8*j
IF k[j]
MEM[base_addr+l+7:base_addr+l] := Saturate_UnsignedInt64_To_Int8(a[i+63:i])
FI
ENDFOR
dst[MAX:32] := 0
vpmovusqb
void _mm512_mask_cvtusepi64_storeu_epi8 (void* base_addr, __mmask8 k, __m512i a)
Synopsis
void _mm512_mask_cvtusepi64_storeu_epi8 (void* base_addr, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpmovusqb m64 {k}, zmm
CPUID Flags: AVX512F
Description
Convert packed unsigned 64-bit integers in a to packed unsigned 8-bit integers with unsigned saturation, and store the active results (those with their respective bit set in writemask k) to unaligned memory at base_addr.
Operation
FOR j := 0 to 7
i := 64*j
l := 8*j
IF k[j]
MEM[base_addr+l+7:base_addr+l] := Saturate_UnsignedInt64_To_Int8(a[i+63:i])
FI
ENDFOR
vdbpsadbw
__m128i _mm_dbsad_epu8 (__m128i a, __m128i b, int imm8)
Synopsis
__m128i _mm_dbsad_epu8 (__m128i a, __m128i b, int imm8)
#include "immintrin.h"
Instruction: vdbpsadbw
CPUID Flags: AVX512VL + AVX512BW
Description
Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst.
Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the upper 8-bit quadruplet of the lane from a. Quadruplets from b are selected according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
Operation
tmp[31:0] := select(b[127:0], imm8[1:0])
tmp[63:32] := select(b[127:0], imm8[3:2])
tmp[95:64] := select(b[127:0], imm8[5:4])
tmp[127:96] := select(b[127:0], imm8[7:6])
FOR j := 0 to 1
i := j*64
dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8])
+ ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24])
dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16])
+ ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32])
dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24])
+ ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40])
dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32])
+ ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48])
ENDFOR
dst[MAX:128] := 0
vdbpsadbw
__m128i _mm_mask_dbsad_epu8 (__m128i src, __mmask8 k, __m128i a, __m128i b, int imm8)
Synopsis
__m128i _mm_mask_dbsad_epu8 (__m128i src, __mmask8 k, __m128i a, __m128i b, int imm8)
#include "immintrin.h"
Instruction: vdbpsadbw
CPUID Flags: AVX512VL + AVX512BW
Description
Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the upper 8-bit quadruplet of the lane from a. Quadruplets from b are selected according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
Operation
tmp[31:0] := select(b[127:0], imm8[1:0])
tmp[63:32] := select(b[127:0], imm8[3:2])
tmp[95:64] := select(b[127:0], imm8[5:4])
tmp[127:96] := select(b[127:0], imm8[7:6])
FOR j := 0 to 1
i := j*64
tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8])
+ ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24])
tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16])
+ ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32])
tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24])
+ ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40])
tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32])
+ ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48])
ENDFOR
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := tmp_dst[i+15:i]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:128] := 0
vdbpsadbw
__m128i _mm_maskz_dbsad_epu8 (__mmask8 k, __m128i a, __m128i b, int imm8)
Synopsis
__m128i _mm_maskz_dbsad_epu8 (__mmask8 k, __m128i a, __m128i b, int imm8)
#include "immintrin.h"
Instruction: vdbpsadbw
CPUID Flags: AVX512VL + AVX512BW
Description
Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the upper 8-bit quadruplet of the lane from a. Quadruplets from b are selected according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
Operation
tmp[31:0] := select(b[127:0], imm8[1:0])
tmp[63:32] := select(b[127:0], imm8[3:2])
tmp[95:64] := select(b[127:0], imm8[5:4])
tmp[127:96] := select(b[127:0], imm8[7:6])
FOR j := 0 to 1
i := j*64
tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8])
+ ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24])
tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16])
+ ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32])
tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24])
+ ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40])
tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32])
+ ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48])
ENDFOR
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := tmp_dst[i+15:i]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vdbpsadbw
__m256i _mm256_dbsad_epu8 (__m256i a, __m256i b, int imm8)
Synopsis
__m256i _mm256_dbsad_epu8 (__m256i a, __m256i b, int imm8)
#include "immintrin.h"
Instruction: vdbpsadbw
CPUID Flags: AVX512VL + AVX512BW
Description
Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst.
Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the upper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
Operation
FOR j := 0 to 1
i := j*128
tmp[i+31:i] := select(b[i+127:i], imm8[1:0])
tmp[i+63:i+32] := select(b[i+127:i], imm8[3:2])
tmp[i+95:i+64] := select(b[i+127:i], imm8[5:4])
tmp[i+127:i+96] := select(b[i+127:i], imm8[7:6])
ENDFOR
FOR j := 0 to 3
i := j*64
dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8])
+ ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24])
dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16])
+ ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32])
dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24])
+ ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40])
dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32])
+ ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48])
ENDFOR
dst[MAX:256] := 0
vdbpsadbw
__m256i _mm256_mask_dbsad_epu8 (__m256i src, __mmask16 k, __m256i a, __m256i b, int imm8)
Synopsis
__m256i _mm256_mask_dbsad_epu8 (__m256i src, __mmask16 k, __m256i a, __m256i b, int imm8)
#include "immintrin.h"
Instruction: vdbpsadbw
CPUID Flags: AVX512VL + AVX512BW
Description
Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the upper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
Operation
FOR j := 0 to 1
i := j*128
tmp[i+31:i] := select(b[i+127:i], imm8[1:0])
tmp[i+63:i+32] := select(b[i+127:i], imm8[3:2])
tmp[i+95:i+64] := select(b[i+127:i], imm8[5:4])
tmp[i+127:i+96] := select(b[i+127:i], imm8[7:6])
ENDFOR
FOR j := 0 to 3
i := j*64
tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8])
+ ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24])
tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16])
+ ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32])
tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24])
+ ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40])
tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32])
+ ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48])
ENDFOR
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := tmp_dst[i+15:i]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
vdbpsadbw
__m256i _mm256_maskz_dbsad_epu8 (__mmask16 k, __m256i a, __m256i b, int imm8)
Synopsis
__m256i _mm256_maskz_dbsad_epu8 (__mmask16 k, __m256i a, __m256i b, int imm8)
#include "immintrin.h"
Instruction: vdbpsadbw
CPUID Flags: AVX512VL + AVX512BW
Description
Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the upper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
Operation
FOR j := 0 to 1
i := j*128
tmp[i+31:i] := select(b[i+127:i], imm8[1:0])
tmp[i+63:i+32] := select(b[i+127:i], imm8[3:2])
tmp[i+95:i+64] := select(b[i+127:i], imm8[5:4])
tmp[i+127:i+96] := select(b[i+127:i], imm8[7:6])
ENDFOR
FOR j := 0 to 3
i := j*64
tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8])
+ ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24])
tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16])
+ ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32])
tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24])
+ ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40])
tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32])
+ ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48])
ENDFOR
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := tmp_dst[i+15:i]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vdbpsadbw
__m512i _mm512_dbsad_epu8 (__m512i a, __m512i b, int imm8)
Synopsis
__m512i _mm512_dbsad_epu8 (__m512i a, __m512i b, int imm8)
#include "immintrin.h"
Instruction: vdbpsadbw
CPUID Flags: AVX512BW
Description
Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst.
Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the upper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
Operation
FOR j := 0 to 3
i := j*128
tmp[i+31:i] := select(b[i+127:i], imm8[1:0])
tmp[i+63:i+32] := select(b[i+127:i], imm8[3:2])
tmp[i+95:i+64] := select(b[i+127:i], imm8[5:4])
tmp[i+127:i+96] := select(b[i+127:i], imm8[7:6])
ENDFOR
FOR j := 0 to 7
i := j*64
dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8])
+ ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24])
dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16])
+ ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32])
dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24])
+ ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40])
dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32])
+ ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48])
ENDFOR
dst[MAX:512] := 0
vdbpsadbw
__m512i _mm512_mask_dbsad_epu8 (__m512i src, __mmask32 k, __m512i a, __m512i b, int imm8)
Synopsis
__m512i _mm512_mask_dbsad_epu8 (__m512i src, __mmask32 k, __m512i a, __m512i b, int imm8)
#include "immintrin.h"
Instruction: vdbpsadbw
CPUID Flags: AVX512BW
Description
Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the upper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
Operation
FOR j := 0 to 3
i := j*128
tmp[i+31:i] := select(b[i+127:i], imm8[1:0])
tmp[i+63:i+32] := select(b[i+127:i], imm8[3:2])
tmp[i+95:i+64] := select(b[i+127:i], imm8[5:4])
tmp[i+127:i+96] := select(b[i+127:i], imm8[7:6])
ENDFOR
FOR j := 0 to 7
i := j*64
tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8])
+ ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24])
tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16])
+ ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32])
tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24])
+ ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40])
tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32])
+ ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48])
ENDFOR
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := tmp_dst[i+15:i]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:512] := 0
vdbpsadbw
__m512i _mm512_maskz_dbsad_epu8 (__mmask32 k, __m512i a, __m512i b, int imm8)
Synopsis
__m512i _mm512_maskz_dbsad_epu8 (__mmask32 k, __m512i a, __m512i b, int imm8)
#include "immintrin.h"
Instruction: vdbpsadbw
CPUID Flags: AVX512BW
Description
Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from a, and the last two SADs use the upper 8-bit quadruplet of the lane from a. Quadruplets from b are selected from within 128-bit lanes according to the control in imm8, and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.
Operation
FOR j := 0 to 3
i := j*128
tmp[i+31:i] := select(b[i+127:i], imm8[1:0])
tmp[i+63:i+32] := select(b[i+127:i], imm8[3:2])
tmp[i+95:i+64] := select(b[i+127:i], imm8[5:4])
tmp[i+127:i+96] := select(b[i+127:i], imm8[7:6])
ENDFOR
FOR j := 0 to 7
i := j*64
tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8])
+ ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24])
tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16])
+ ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32])
tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24])
+ ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40])
tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32])
+ ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48])
ENDFOR
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := tmp_dst[i+15:i]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
delay
void _mm_delay_32 (unsigned int r1)
Synopsis
void _mm_delay_32 (unsigned int r1)
#include "immintrin.h"
Instruction: delay r32
CPUID Flags: KNCNI
Description
Stalls a thread without blocking other threads for 32-bit unsigned integer r1 clock cycles.
Operation
BlockThread(r1)
delay
void _mm_delay_64 (unsigned __int64 r1)
Synopsis
void _mm_delay_64 (unsigned __int64 r1)
#include "immintrin.h"
Instruction: delay r64
CPUID Flags: KNCNI
Description
Stalls a thread without blocking other threads for 64-bit unsigned integer r1 clock cycles.
Operation
BlockThread(r1)
...
__m128i _mm_div_epi16 (__m128i a, __m128i b)
Synopsis
__m128i _mm_div_epi16 (__m128i a, __m128i b)
#include "immintrin.h"
CPUID Flags: SSE
Description
Divide packed 16-bit integers in a by packed elements in b, and store the truncated results in dst.
Operation
FOR j := 0 to 7
i := 16*j
dst[i+15:i] := TRUNCATE(a[i+15:i] / b[i+15:i])
ENDFOR
dst[MAX:128] := 0
...
__m256i _mm256_div_epi16 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_div_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
CPUID Flags: AVX
Description
Divide packed 16-bit integers in a by packed elements in b, and store the truncated results in dst.
Operation
FOR j := 0 to 15
i := 16*j
dst[i+15:i] := TRUNCATE(a[i+15:i] / b[i+15:i])
ENDFOR
dst[MAX:256] := 0
...
__m512i _mm512_div_epi16 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_div_epi16 (__m512i a, __m512i b)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Divide packed 16-bit integers in a by packed elements in b, and store the truncated results in dst.
Operation
FOR j := 0 to 31
i := 16*j
dst[i+15:i] := TRUNCATE(a[i+15:i] / b[i+15:i])
ENDFOR
dst[MAX:512] := 0
...
__m128i _mm_div_epi32 (__m128i a, __m128i b)
Synopsis
__m128i _mm_div_epi32 (__m128i a, __m128i b)
#include "immintrin.h"
CPUID Flags: SSE
Description
Divide packed 32-bit integers in a by packed elements in b, and store the truncated results in dst.
Operation
FOR j := 0 to 3
i := 32*j
dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256i _mm256_div_epi32 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_div_epi32 (__m256i a, __m256i b)
#include "immintrin.h"
CPUID Flags: AVX
Description
Divide packed 32-bit integers in a by packed elements in b, and store the truncated results in dst.
Operation
FOR j := 0 to 7
i := 32*j
dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:256] := 0
...
__m512i _mm512_div_epi32 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_div_epi32 (__m512i a, __m512i b)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Divide packed 32-bit integers in a by packed elements in b, and store the truncated results in dst.
Operation
FOR j := 0 to 15
i := 32*j
dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:512] := 0
...
__m512i _mm512_mask_div_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_div_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Divide packed 32-bit integers in a by packed elements in b, and store the truncated results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := 32*j
IF k[j]
dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128i _mm_div_epi64 (__m128i a, __m128i b)
Synopsis
__m128i _mm_div_epi64 (__m128i a, __m128i b)
#include "immintrin.h"
CPUID Flags: SSE
Description
Divide packed 64-bit integers in a by packed elements in b, and store the truncated results in dst.
Operation
FOR j := 0 to 1
i := 64*j
dst[i+63:i] := TRUNCATE(a[i+63:i] / b[i+63:i])
ENDFOR
dst[MAX:128] := 0
...
__m256i _mm256_div_epi64 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_div_epi64 (__m256i a, __m256i b)
#include "immintrin.h"
CPUID Flags: AVX
Description
Divide packed 64-bit integers in a by packed elements in b, and store the truncated results in dst.
Operation
FOR j := 0 to 3
i := 64*j
dst[i+63:i] := TRUNCATE(a[i+63:i] / b[i+63:i])
ENDFOR
dst[MAX:256] := 0
...
__m512i _mm512_div_epi64 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_div_epi64 (__m512i a, __m512i b)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Divide packed 64-bit integers in a by packed elements in b, and store the truncated results in dst.
Operation
FOR j := 0 to 7
i := 64*j
dst[i+63:i] := TRUNCATE(a[i+63:i] / b[i+63:i])
ENDFOR
dst[MAX:512] := 0
...
__m128i _mm_div_epi8 (__m128i a, __m128i b)
Synopsis
__m128i _mm_div_epi8 (__m128i a, __m128i b)
#include "immintrin.h"
CPUID Flags: SSE
Description
Divide packed 8-bit integers in a by packed elements in b, and store the truncated results in dst.
Operation
FOR j := 0 to 15
i := 8*j
dst[i+7:i] := TRUNCATE(a[i+7:i] / b[i+7:i])
ENDFOR
dst[MAX:128] := 0
...
__m256i _mm256_div_epi8 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_div_epi8 (__m256i a, __m256i b)
#include "immintrin.h"
CPUID Flags: AVX
Description
Divide packed 8-bit integers in a by packed elements in b, and store the truncated results in dst.
Operation
FOR j := 0 to 31
i := 8*j
dst[i+7:i] := TRUNCATE(a[i+7:i] / b[i+7:i])
ENDFOR
dst[MAX:256] := 0
...
__m512i _mm512_div_epi8 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_div_epi8 (__m512i a, __m512i b)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Divide packed 8-bit integers in a by packed elements in b, and store the truncated results in dst.
Operation
FOR j := 0 to 63
i := 8*j
dst[i+7:i] := TRUNCATE(a[i+7:i] / b[i+7:i])
ENDFOR
dst[MAX:512] := 0
...
__m128i _mm_div_epu16 (__m128i a, __m128i b)
Synopsis
__m128i _mm_div_epu16 (__m128i a, __m128i b)
#include "immintrin.h"
CPUID Flags: SSE
Description
Divide packed unsigned 16-bit integers in a by packed elements in b, and store the truncated results in dst.
Operation
FOR j := 0 to 7
i := 16*j
dst[i+15:i] := TRUNCATE(a[i+15:i] / b[i+15:i])
ENDFOR
dst[MAX:128] := 0
...
__m256i _mm256_div_epu16 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_div_epu16 (__m256i a, __m256i b)
#include "immintrin.h"
CPUID Flags: AVX
Description
Divide packed unsigned 16-bit integers in a by packed elements in b, and store the truncated results in dst.
Operation
FOR j := 0 to 15
i := 16*j
dst[i+15:i] := TRUNCATE(a[i+15:i] / b[i+15:i])
ENDFOR
dst[MAX:256] := 0
...
__m512i _mm512_div_epu16 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_div_epu16 (__m512i a, __m512i b)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Divide packed unsigned 16-bit integers in a by packed elements in b, and store the truncated results in dst.
Operation
FOR j := 0 to 31
i := 16*j
dst[i+15:i] := TRUNCATE(a[i+15:i] / b[i+15:i])
ENDFOR
dst[MAX:512] := 0
...
__m128i _mm_div_epu32 (__m128i a, __m128i b)
Synopsis
__m128i _mm_div_epu32 (__m128i a, __m128i b)
#include "immintrin.h"
CPUID Flags: SSE
Description
Divide packed unsigned 32-bit integers in a by packed elements in b, and store the truncated results in dst.
Operation
FOR j := 0 to 3
i := 32*j
dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256i _mm256_div_epu32 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_div_epu32 (__m256i a, __m256i b)
#include "immintrin.h"
CPUID Flags: AVX
Description
Divide packed unsigned 32-bit integers in a by packed elements in b, and store the truncated results in dst.
Operation
FOR j := 0 to 7
i := 32*j
dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:256] := 0
...
__m512i _mm512_div_epu32 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_div_epu32 (__m512i a, __m512i b)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Divide packed unsigned 32-bit integers in a by packed elements in b, and store the truncated results in dst.
Operation
FOR j := 0 to 15
i := 32*j
dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:512] := 0
...
__m512i _mm512_mask_div_epu32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_div_epu32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Divide packed unsigned 32-bit integers in a by packed elements in b, and store the truncated results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := 32*j
IF k[j]
dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128i _mm_div_epu64 (__m128i a, __m128i b)
Synopsis
__m128i _mm_div_epu64 (__m128i a, __m128i b)
#include "immintrin.h"
CPUID Flags: SSE
Description
Divide packed unsigned 64-bit integers in a by packed elements in b, and store the truncated results in dst.
Operation
FOR j := 0 to 1
i := 64*j
dst[i+63:i] := TRUNCATE(a[i+63:i] / b[i+63:i])
ENDFOR
dst[MAX:128] := 0
...
__m256i _mm256_div_epu64 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_div_epu64 (__m256i a, __m256i b)
#include "immintrin.h"
CPUID Flags: AVX
Description
Divide packed unsigned 64-bit integers in a by packed elements in b, and store the truncated results in dst.
Operation
FOR j := 0 to 3
i := 64*j
dst[i+63:i] := TRUNCATE(a[i+63:i] / b[i+63:i])
ENDFOR
dst[MAX:256] := 0
...
__m512i _mm512_div_epu64 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_div_epu64 (__m512i a, __m512i b)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Divide packed unsigned 64-bit integers in a by packed elements in b, and store the truncated results in dst.
Operation
FOR j := 0 to 7
i := 64*j
dst[i+63:i] := TRUNCATE(a[i+63:i] / b[i+63:i])
ENDFOR
dst[MAX:512] := 0
...
__m128i _mm_div_epu8 (__m128i a, __m128i b)
Synopsis
__m128i _mm_div_epu8 (__m128i a, __m128i b)
#include "immintrin.h"
CPUID Flags: SSE
Description
Divide packed unsigned 8-bit integers in a by packed elements in b, and store the truncated results in dst.
Operation
FOR j := 0 to 15
i := 8*j
dst[i+7:i] := TRUNCATE(a[i+7:i] / b[i+7:i])
ENDFOR
dst[MAX:128] := 0
...
__m256i _mm256_div_epu8 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_div_epu8 (__m256i a, __m256i b)
#include "immintrin.h"
CPUID Flags: AVX
Description
Divide packed unsigned 8-bit integers in a by packed elements in b, and store the truncated results in dst.
Operation
FOR j := 0 to 31
i := 8*j
dst[i+7:i] := TRUNCATE(a[i+7:i] / b[i+7:i])
ENDFOR
dst[MAX:256] := 0
...
__m512i _mm512_div_epu8 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_div_epu8 (__m512i a, __m512i b)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Divide packed unsigned 8-bit integers in a by packed elements in b, and store the truncated results in dst.
Operation
FOR j := 0 to 63
i := 8*j
dst[i+7:i] := TRUNCATE(a[i+7:i] / b[i+7:i])
ENDFOR
dst[MAX:512] := 0
divpd
__m128d _mm_div_pd (__m128d a, __m128d b)
Synopsis
__m128d _mm_div_pd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: divpd xmm, xmm
CPUID Flags: SSE2
Description
Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst.
Operation
FOR j := 0 to 1
i := 64*j
dst[i+63:i] := a[i+63:i] / b[i+63:i]
ENDFOR
Performance
vdivpd
__m128d _mm_mask_div_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_mask_div_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vdivpd
CPUID Flags: AVX512VL + AVX512F
Description
Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := 64*j
IF k[j]
dst[i+63:i] := a[i+63:i] / b[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vdivpd
__m128d _mm_maskz_div_pd (__mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_maskz_div_pd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vdivpd
CPUID Flags: AVX512VL + AVX512F
Description
Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := 64*j
IF k[j]
dst[i+63:i] := a[i+63:i] / b[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vdivpd
__m256d _mm256_div_pd (__m256d a, __m256d b)
Synopsis
__m256d _mm256_div_pd (__m256d a, __m256d b)
#include "immintrin.h"
Instruction: vdivpd ymm, ymm, ymm
CPUID Flags: AVX
Description
Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst.
Operation
FOR j := 0 to 3
i := 64*j
dst[i+63:i] := a[i+63:i] / b[i+63:i]
ENDFOR
dst[MAX:256] := 0
Performance
vdivpd
__m256d _mm256_mask_div_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)
Synopsis
__m256d _mm256_mask_div_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vdivpd
CPUID Flags: AVX512VL + AVX512F
Description
Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 64*j
IF k[j]
dst[i+63:i] := a[i+63:i] / b[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vdivpd
__m256d _mm256_maskz_div_pd (__mmask8 k, __m256d a, __m256d b)
Synopsis
__m256d _mm256_maskz_div_pd (__mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vdivpd
CPUID Flags: AVX512VL + AVX512F
Description
Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 64*j
IF k[j]
dst[i+63:i] := a[i+63:i] / b[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vdivpd
__m512d _mm512_div_pd (__m512d a, __m512d b)
Synopsis
__m512d _mm512_div_pd (__m512d a, __m512d b)
#include "immintrin.h"
Instruction: vdivpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst.
Operation
FOR j := 0 to 7
i := 64*j
dst[i+63:i] := a[i+63:i] / b[i+63:i]
ENDFOR
dst[MAX:512] := 0
vdivpd
__m512d _mm512_mask_div_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
Synopsis
__m512d _mm512_mask_div_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vdivpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 64*j
IF k[j]
dst[i+63:i] := a[i+63:i] / b[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vdivpd
__m512d _mm512_maskz_div_pd (__mmask8 k, __m512d a, __m512d b)
Synopsis
__m512d _mm512_maskz_div_pd (__mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vdivpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 64*j
IF k[j]
dst[i+63:i] := a[i+63:i] / b[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
divps
__m128 _mm_div_ps (__m128 a, __m128 b)
Synopsis
__m128 _mm_div_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: divps xmm, xmm
CPUID Flags: SSE
Description
Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst.
Operation
FOR j := 0 to 3
i := 32*j
dst[i+31:i] := a[i+31:i] / b[i+31:i]
ENDFOR
Performance
vdivps
__m128 _mm_mask_div_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_mask_div_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vdivps
CPUID Flags: AVX512VL + AVX512F
Description
Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 32*j
IF k[j]
dst[i+31:i] := a[i+31:i] / b[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vdivps
__m128 _mm_maskz_div_ps (__mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_maskz_div_ps (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vdivps
CPUID Flags: AVX512VL + AVX512F
Description
Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := 32*j
IF k[j]
dst[i+31:i] := a[i+31:i] / b[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vdivps
__m256 _mm256_div_ps (__m256 a, __m256 b)
Synopsis
__m256 _mm256_div_ps (__m256 a, __m256 b)
#include "immintrin.h"
Instruction: vdivps ymm, ymm, ymm
CPUID Flags: AVX
Description
Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst.
Operation
FOR j := 0 to 7
i := 32*j
dst[i+31:i] := a[i+31:i] / b[i+31:i]
ENDFOR
dst[MAX:256] := 0
Performance
vdivps
__m256 _mm256_mask_div_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)
Synopsis
__m256 _mm256_mask_div_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vdivps
CPUID Flags: AVX512VL + AVX512F
Description
Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 32*j
IF k[j]
dst[i+31:i] := a[i+31:i] / b[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vdivps
__m256 _mm256_maskz_div_ps (__mmask8 k, __m256 a, __m256 b)
Synopsis
__m256 _mm256_maskz_div_ps (__mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vdivps
CPUID Flags: AVX512VL + AVX512F
Description
Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := 32*j
IF k[j]
dst[i+31:i] := a[i+31:i] / b[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vdivps
__m512 _mm512_div_ps (__m512 a, __m512 b)
Synopsis
__m512 _mm512_div_ps (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vdivps zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst.
Operation
FOR j := 0 to 15
i := 32*j
dst[i+31:i] := a[i+31:i] / b[i+31:i]
ENDFOR
dst[MAX:512] := 0
vdivps
__m512 _mm512_mask_div_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
Synopsis
__m512 _mm512_mask_div_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vdivps zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := 32*j
IF k[j]
dst[i+31:i] := a[i+31:i] / b[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vdivps
__m512 _mm512_maskz_div_ps (__mmask16 k, __m512 a, __m512 b)
Synopsis
__m512 _mm512_maskz_div_ps (__mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vdivps zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Divide packed single-precision (32-bit) floating-point elements in a by packed elements in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := 32*j
IF k[j]
dst[i+31:i] := a[i+31:i] / b[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vdivpd
__m512d _mm512_div_round_pd (__m512d a, __m512d b, int rounding)
Synopsis
__m512d _mm512_div_round_pd (__m512d a, __m512d b, int rounding)
#include "immintrin.h"
Instruction: vdivpd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F
Description
Divide packed double-precision (64-bit) floating-point elements in
a by packed elements in
b, and store the results in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := 64*j
dst[i+63:i] := a[i+63:i] / b[i+63:i]
ENDFOR
dst[MAX:512] := 0
vdivpd
__m512d _mm512_mask_div_round_pd (__m512d src, __mmask8 k, __m512d a, __m512d b, int rounding)
Synopsis
__m512d _mm512_mask_div_round_pd (__m512d src, __mmask8 k, __m512d a, __m512d b, int rounding)
#include "immintrin.h"
Instruction: vdivpd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F
Description
Divide packed double-precision (64-bit) floating-point elements in
a by packed elements in
b, and store the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := 64*j
IF k[j]
dst[i+63:i] := a[i+63:i] / b[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vdivpd
__m512d _mm512_maskz_div_round_pd (__mmask8 k, __m512d a, __m512d b, int rounding)
Synopsis
__m512d _mm512_maskz_div_round_pd (__mmask8 k, __m512d a, __m512d b, int rounding)
#include "immintrin.h"
Instruction: vdivpd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F
Description
Divide packed double-precision (64-bit) floating-point elements in
a by packed elements in
b, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := 64*j
IF k[j]
dst[i+63:i] := a[i+63:i] / b[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vdivps
__m512 _mm512_div_round_ps (__m512 a, __m512 b, int rounding)
Synopsis
__m512 _mm512_div_round_ps (__m512 a, __m512 b, int rounding)
#include "immintrin.h"
Instruction: vdivps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F
Description
Divide packed single-precision (32-bit) floating-point elements in
a by packed elements in
b, and store the results in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := 32*j
dst[i+31:i] := a[i+31:i] / b[i+31:i]
ENDFOR
dst[MAX:512] := 0
vdivps
__m512 _mm512_mask_div_round_ps (__m512 src, __mmask16 k, __m512 a, __m512 b, int rounding)
Synopsis
__m512 _mm512_mask_div_round_ps (__m512 src, __mmask16 k, __m512 a, __m512 b, int rounding)
#include "immintrin.h"
Instruction: vdivps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F
Description
Divide packed single-precision (32-bit) floating-point elements in
a by packed elements in
b, and store the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := 32*j
IF k[j]
dst[i+31:i] := a[i+31:i] / b[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vdivps
__m512 _mm512_maskz_div_round_ps (__mmask16 k, __m512 a, __m512 b, int rounding)
Synopsis
__m512 _mm512_maskz_div_round_ps (__mmask16 k, __m512 a, __m512 b, int rounding)
#include "immintrin.h"
Instruction: vdivps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F
Description
Divide packed single-precision (32-bit) floating-point elements in
a by packed elements in
b, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := 32*j
IF k[j]
dst[i+31:i] := a[i+31:i] / b[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vdivsd
__m128d _mm_div_round_sd (__m128d a, __m128d b, int rounding)
Synopsis
__m128d _mm_div_round_sd (__m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vdivsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Divide the lower double-precision (64-bit) floating-point element in
a by the lower double-precision (64-bit) floating-point element in
b, store the result in the lower element of
dst, and copy the upper element from
a to the upper element of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[63:0] := a[63:0] / b[63:0]
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vdivsd
__m128d _mm_mask_div_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int rounding)
Synopsis
__m128d _mm_mask_div_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vdivsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Divide the lower double-precision (64-bit) floating-point element in
a by the lower double-precision (64-bit) floating-point element in
b, store the result in the lower element of
dst using writemask
k (the element is copied from
src when mask bit 0 is not set), and copy the upper element from
a to the upper element of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[63:0] := a[63:0] / b[63:0]
ELSE
dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vdivsd
__m128d _mm_maskz_div_round_sd (__mmask8 k, __m128d a, __m128d b, int rounding)
Synopsis
__m128d _mm_maskz_div_round_sd (__mmask8 k, __m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vdivsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Divide the lower double-precision (64-bit) floating-point element in
a by the lower double-precision (64-bit) floating-point element in
b, store the result in the lower element of
dst using zeromask
k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from
a to the upper element of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[63:0] := a[63:0] / b[63:0]
ELSE
dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vdivss
__m128 _mm_div_round_ss (__m128 a, __m128 b, int rounding)
Synopsis
__m128 _mm_div_round_ss (__m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vdivss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Divide the lower single-precision (32-bit) floating-point element in
a by the lower single-precision (32-bit) floating-point element in
b, store the result in the lower element of
dst, and copy the upper 3 packed elements from
a to the upper elements of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[31:0] := a[31:0] / b[31:0]
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vdivss
__m128 _mm_mask_div_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int rounding)
Synopsis
__m128 _mm_mask_div_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vdivss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Divide the lower single-precision (32-bit) floating-point element in
a by the lower single-precision (32-bit) floating-point element in
b, store the result in the lower element of
dst using writemask
k (the element is copied from
src when mask bit 0 is not set), and copy the upper 3 packed elements from
a to the upper elements of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[31:0] := a[31:0] / b[31:0]
ELSE
dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vdivss
__m128 _mm_maskz_div_round_ss (__mmask8 k, __m128 a, __m128 b, int rounding)
Synopsis
__m128 _mm_maskz_div_round_ss (__mmask8 k, __m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vdivss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Divide the lower single-precision (32-bit) floating-point element in
a by the lower single-precision (32-bit) floating-point element in
b, store the result in the lower element of
dst using zeromask
k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from
a to the upper elements of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[31:0] := a[31:0] / b[31:0]
ELSE
dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
divsd
__m128d _mm_div_sd (__m128d a, __m128d b)
Synopsis
__m128d _mm_div_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: divsd xmm, xmm
CPUID Flags: SSE2
Description
Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
Operation
dst[63:0] := a[63:0] / b[63:0]
dst[127:64] := a[127:64]
Performance
vdivsd
__m128d _mm_mask_div_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_mask_div_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vdivsd xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Operation
IF k[0]
dst[63:0] := a[63:0] / b[63:0]
ELSE
dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vdivsd
__m128d _mm_maskz_div_sd (__mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_maskz_div_sd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vdivsd xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Operation
IF k[0]
dst[63:0] := a[63:0] / b[63:0]
ELSE
dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
divss
__m128 _mm_div_ss (__m128 a, __m128 b)
Synopsis
__m128 _mm_div_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: divss xmm, xmm
CPUID Flags: SSE
Description
Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
dst[31:0] := a[31:0] / b[31:0]
dst[127:32] := a[127:32]
Performance
vdivss
__m128 _mm_mask_div_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_mask_div_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vdivss xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
IF k[0]
dst[31:0] := a[31:0] / b[31:0]
ELSE
dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vdivss
__m128 _mm_maskz_div_ss (__mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_maskz_div_ss (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vdivss xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
IF k[0]
dst[31:0] := a[31:0] / b[31:0]
ELSE
dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
dppd
__m128d _mm_dp_pd (__m128d a, __m128d b, const int imm8)
Synopsis
__m128d _mm_dp_pd (__m128d a, __m128d b, const int imm8)
#include "smmintrin.h"
Instruction: dppd xmm, xmm, imm
CPUID Flags: SSE4.1
Description
Conditionally multiply the packed double-precision (64-bit) floating-point elements in a and b using the high 4 bits in imm8, sum the four products, and conditionally store the sum in dst using the low 4 bits of imm8.
Operation
DP(a[127:0], b[127:0], imm8[7:0]) {
FOR j := 0 to 1
i := j*64
IF imm8[(4+j)%8]
temp[i+63:i] := a[i+63:i] * b[i+63:i]
ELSE
temp[i+63:i] := 0
FI
ENDFOR
sum[63:0] := temp[127:64] + temp[63:0]
FOR j := 0 to 1
i := j*64
IF imm8[j%8]
tmpdst[i+63:i] := sum[63:0]
ELSE
tmpdst[i+63:i] := 0
FI
ENDFOR
RETURN tmpdst[127:0]
}
dst[127:0] := DP(a[127:0], b[127:0], imm8[7:0])
Performance
dpps
__m128 _mm_dp_ps (__m128 a, __m128 b, const int imm8)
Synopsis
__m128 _mm_dp_ps (__m128 a, __m128 b, const int imm8)
#include "smmintrin.h"
Instruction: dpps xmm, xmm, imm
CPUID Flags: SSE4.1
Description
Conditionally multiply the packed single-precision (32-bit) floating-point elements in a and b using the high 4 bits in imm8, sum the four products, and conditionally store the sum in dst using the low 4 bits of imm8.
Operation
DP(a[127:0], b[127:0], imm8[7:0]) {
FOR j := 0 to 3
i := j*32
IF imm8[(4+j)%8]
temp[i+31:i] := a[i+31:i] * b[i+31:i]
ELSE
temp[i+31:i] := 0
FI
ENDFOR
sum[31:0] := (temp[127:96] + temp[95:64]) + (temp[63:32] + temp[31:0])
FOR j := 0 to 3
i := j*32
IF imm8[j%8]
tmpdst[i+31:i] := sum[31:0]
ELSE
tmpdst[i+31:i] := 0
FI
ENDFOR
RETURN tmpdst[127:0]
}
dst[127:0] := DP(a[127:0], b[127:0], imm8[7:0])
Performance
vdpps
__m256 _mm256_dp_ps (__m256 a, __m256 b, const int imm8)
Synopsis
__m256 _mm256_dp_ps (__m256 a, __m256 b, const int imm8)
#include "immintrin.h"
Instruction: vdpps ymm, ymm, ymm, imm
CPUID Flags: AVX
Description
Conditionally multiply the packed single-precision (32-bit) floating-point elements in a and b using the high 4 bits in imm8, sum the four products, and conditionally store the sum in dst using the low 4 bits of imm8.
Operation
DP(a[127:0], b[127:0], imm8[7:0]) {
FOR j := 0 to 3
i := j*32
IF imm8[(4+j)%8]
temp[i+31:i] := a[i+31:i] * b[i+31:i]
ELSE
temp[i+31:i] := 0
FI
ENDFOR
sum[31:0] := (temp[127:96] + temp[95:64]) + (temp[63:32] + temp[31:0])
FOR j := 0 to 3
i := j*32
IF imm8[j%8]
tmpdst[i+31:i] := sum[31:0]
ELSE
tmpdst[i+31:i] := 0
FI
ENDFOR
RETURN tmpdst[127:0]
}
dst[127:0] := DP(a[127:0], b[127:0], imm8[7:0])
dst[255:128] := DP(a[255:128], b[255:128], imm8[7:0])
dst[MAX:256] := 0
Performance
...
__m128d _mm_erf_pd (__m128d a)
Synopsis
__m128d _mm_erf_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the error function of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := ERF(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
...
__m256d _mm256_erf_pd (__m256d a)
Synopsis
__m256d _mm256_erf_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the error function of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := ERF(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
...
__m512d _mm512_erf_pd (__m512d a)
Synopsis
__m512d _mm512_erf_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the error function of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := ERF(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
...
__m512d _mm512_mask_erf_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_erf_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the error function of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := ERF(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128 _mm_erf_ps (__m128 a)
Synopsis
__m128 _mm_erf_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the error function of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := ERF(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256 _mm256_erf_ps (__m256 a)
Synopsis
__m256 _mm256_erf_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the error function of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := ERF(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
...
__m512 _mm512_erf_ps (__m512 a)
Synopsis
__m512 _mm512_erf_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the error function of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := ERF(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
...
__m512 _mm512_mask_erf_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_erf_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the error function of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := ERF(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128d _mm_erfc_pd (__m128d a)
Synopsis
__m128d _mm_erfc_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the complementary error function of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := 1.0 - ERF(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
...
__m256d _mm256_erfc_pd (__m256d a)
Synopsis
__m256d _mm256_erfc_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the complementary error function of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := 1.0 - ERF(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
...
__m512d _mm512_erfc_pd (__m512d a)
Synopsis
__m512d _mm512_erfc_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the complementary error function of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := 1.0 - ERF(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
...
__m512d _mm512_mask_erfc_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_erfc_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the complementary error function of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := 1.0 - ERF(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128 _mm_erfc_ps (__m128 a)
Synopsis
__m128 _mm_erfc_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the complementary error function of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := 1.0 - ERF(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256 _mm256_erfc_ps (__m256 a)
Synopsis
__m256 _mm256_erfc_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the complementary error function of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := 1.0 - ERF(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
...
__m512 _mm512_erfc_ps (__m512 a)
Synopsis
__m512 _mm512_erfc_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the complementary error function of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := 1.0 - ERF(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
...
__m512 _mm512_mask_erfc_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_erfc_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the complementary error function of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := 1.0 - ERF(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128d _mm_erfcinv_pd (__m128d a)
Synopsis
__m128d _mm_erfcinv_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the inverse complementary error function of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := 1.0 / (1.0 - ERF(a[i+63:i]))
ENDFOR
dst[MAX:128] := 0
...
__m256d _mm256_erfcinv_pd (__m256d a)
Synopsis
__m256d _mm256_erfcinv_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the inverse complementary error function of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := 1.0 / (1.0 - ERF(a[i+63:i]))
ENDFOR
dst[MAX:256] := 0
...
__m512d _mm512_erfcinv_pd (__m512d a)
Synopsis
__m512d _mm512_erfcinv_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the inverse complementary error function of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := 1.0 / (1.0 - ERF(a[i+63:i]))
ENDFOR
dst[MAX:512] := 0
...
__m512d _mm512_mask_erfcinv_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_erfcinv_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the inverse complementary error function of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := 1.0 / (1.0 - ERF(a[i+63:i]))
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128 _mm_erfcinv_ps (__m128 a)
Synopsis
__m128 _mm_erfcinv_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the inverse complementary error function of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := 1.0 / (1.0 - ERF(a[i+31:i]))
ENDFOR
dst[MAX:128] := 0
...
__m256 _mm256_erfcinv_ps (__m256 a)
Synopsis
__m256 _mm256_erfcinv_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the inverse complementary error function of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := 1.0 / (1.0 - ERF(a[i+31:i]))
ENDFOR
dst[MAX:256] := 0
...
__m512 _mm512_erfcinv_ps (__m512 a)
Synopsis
__m512 _mm512_erfcinv_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the inverse complementary error function of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := 1.0 / (1.0 - ERF(a[i+31:i]))
ENDFOR
dst[MAX:512] := 0
...
__m512 _mm512_mask_erfcinv_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_erfcinv_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the inverse complementary error function of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := 1.0 / (1.0 - ERF(a[i+31:i]))
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128d _mm_erfinv_pd (__m128d a)
Synopsis
__m128d _mm_erfinv_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the inverse error function of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := 1.0 / ERF(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
...
__m256d _mm256_erfinv_pd (__m256d a)
Synopsis
__m256d _mm256_erfinv_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the inverse error function of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := 1.0 / ERF(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
...
__m512d _mm512_erfinv_pd (__m512d a)
Synopsis
__m512d _mm512_erfinv_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the inverse error function of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := 1.0 / ERF(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
...
__m512d _mm512_mask_erfinv_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_erfinv_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the inverse error function of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := 1.0 / ERF(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128 _mm_erfinv_ps (__m128 a)
Synopsis
__m128 _mm_erfinv_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the inverse error function of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := 1.0 / ERF(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256 _mm256_erfinv_ps (__m256 a)
Synopsis
__m256 _mm256_erfinv_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the inverse error function of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := 1.0 / ERF(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
...
__m512 _mm512_erfinv_ps (__m512 a)
Synopsis
__m512 _mm512_erfinv_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the inverse error function of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := 1.0 / ERF(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
...
__m512 _mm512_mask_erfinv_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_erfinv_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the inverse error function of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := 1.0 / ERF(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128d _mm_exp_pd (__m128d a)
Synopsis
__m128d _mm_exp_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the exponential value of e raised to the power of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := e^(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
...
__m256d _mm256_exp_pd (__m256d a)
Synopsis
__m256d _mm256_exp_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the exponential value of e raised to the power of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := e^(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
...
__m512d _mm512_exp_pd (__m512d a)
Synopsis
__m512d _mm512_exp_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the exponential value of e raised to the power of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := e^(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
...
__m512d _mm512_mask_exp_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_exp_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the exponential value of e raised to the power of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := e^(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128 _mm_exp_ps (__m128 a)
Synopsis
__m128 _mm_exp_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the exponential value of e raised to the power of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := e^(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256 _mm256_exp_ps (__m256 a)
Synopsis
__m256 _mm256_exp_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the exponential value of e raised to the power of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := e^(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
...
__m512 _mm512_exp_ps (__m512 a)
Synopsis
__m512 _mm512_exp_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the exponential value of e raised to the power of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := e^(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
...
__m512 _mm512_mask_exp_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_exp_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the exponential value of e raised to the power of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := e^(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128d _mm_exp10_pd (__m128d a)
Synopsis
__m128d _mm_exp10_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the exponential value of 10 raised to the power of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := 10^(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
...
__m256d _mm256_exp10_pd (__m256d a)
Synopsis
__m256d _mm256_exp10_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the exponential value of 10 raised to the power of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := 10^(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
...
__m512d _mm512_exp10_pd (__m512d a)
Synopsis
__m512d _mm512_exp10_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the exponential value of 10 raised to the power of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := 10^(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
...
__m512d _mm512_mask_exp10_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_exp10_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the exponential value of 10 raised to the power of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := 10^(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128 _mm_exp10_ps (__m128 a)
Synopsis
__m128 _mm_exp10_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the exponential value of 10 raised to the power of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := 10^(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256 _mm256_exp10_ps (__m256 a)
Synopsis
__m256 _mm256_exp10_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the exponential value of 10 raised to the power of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := 10^(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
...
__m512 _mm512_exp10_ps (__m512 a)
Synopsis
__m512 _mm512_exp10_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the exponential value of 10 raised to the power of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := 10^(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
...
__m512 _mm512_mask_exp10_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_exp10_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the exponential value of 10 raised to the power of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := 10^(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128d _mm_exp2_pd (__m128d a)
Synopsis
__m128d _mm_exp2_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := 2^(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
...
__m256d _mm256_exp2_pd (__m256d a)
Synopsis
__m256d _mm256_exp2_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := 2^(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
...
__m512d _mm512_exp2_pd (__m512d a)
Synopsis
__m512d _mm512_exp2_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := 2^(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
...
__m512d _mm512_mask_exp2_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_exp2_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := 2^(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128 _mm_exp2_ps (__m128 a)
Synopsis
__m128 _mm_exp2_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := 2^(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256 _mm256_exp2_ps (__m256 a)
Synopsis
__m256 _mm256_exp2_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := 2^(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
...
__m512 _mm512_exp2_ps (__m512 a)
Synopsis
__m512 _mm512_exp2_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := 2^(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
...
__m512 _mm512_mask_exp2_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_exp2_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := 2^(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vexp223ps
__m512 _mm512_exp223_ps (__m512i v2)
Synopsis
__m512 _mm512_exp223_ps (__m512i v2)
#include "immintrin.h"
Instruction: vexp223ps zmm {k}, zmm
CPUID Flags: KNCNI
Description
Approximates the base-2 exponent of the packed single-precision (32-bit) floating-point elements in v2 with eight bits for sign and magnitude and 24 bits for the fractional part. Results are stored in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := exp223(v2[i+31:i])
ENDFOR
dst[MAX:512] := 0
vexp223ps
__m512 _mm512_mask_exp223_ps (__m512 src, __mmask16 k, __m512i v2)
Synopsis
__m512 _mm512_mask_exp223_ps (__m512 src, __mmask16 k, __m512i v2)
#include "immintrin.h"
Instruction: vexp223ps zmm {k}, zmm
CPUID Flags: KNCNI
Description
Approximates the base-2 exponent of the packed single-precision (32-bit) floating-point elements in v2 with eight bits for sign and magnitude and 24 bits for the fractional part. Results are stored in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := exp223(v2[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vexp2pd
__m512d _mm512_exp2a23_pd (__m512d a)
Synopsis
__m512d _mm512_exp2a23_pd (__m512d a)
#include "immintrin.h"
Instruction: vexp2pd zmm {k}, zmm
CPUID Flags: AVX512ER
Description
Compute the approximate exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-23.
Operation
FOR j := 0 to 7
i := j*64;
dst[i+63:i] := EXP_2_23_DP(a[i+63:i]);
ENDFOR;
dst[MAX:512] := 0
vexp2pd
__m512d _mm512_mask_exp2a23_pd (__m512d a, __mmask8 k, __m512d src)
Synopsis
__m512d _mm512_mask_exp2a23_pd (__m512d a, __mmask8 k, __m512d src)
#include "immintrin.h"
Instruction: vexp2pd zmm {k}, zmm
CPUID Flags: AVX512ER
Description
Compute the approximate exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-23.
Operation
FOR j := 0 to 7
i := j*64;
IF k[j] THEN
dst[i+63:i] := EXP_2_23_DP(a[i+63:i]);
ELSE
dst[i+63:i] := src[i+63:i];
FI
ENDFOR;
dst[MAX:512] := 0
vexp2pd
__m512d _mm512_maskz_exp2a23_pd (__mmask8 k, __m512d a)
Synopsis
__m512d _mm512_maskz_exp2a23_pd (__mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vexp2pd zmm {k}, zmm
CPUID Flags: AVX512ER
Description
Compute the approximate exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-23.
Operation
FOR j := 0 to 7
i := j*64;
IF k[j] THEN
dst[i+63:i] := EXP_2_23_DP(a[i+63:i]);
ELSE
dst[i+63:i] := 0;
FI
ENDFOR;
dst[MAX:512] := 0
vexp2ps
__m512 _mm512_exp2a23_ps (__m512 a)
Synopsis
__m512 _mm512_exp2a23_ps (__m512 a)
#include "immintrin.h"
Instruction: vexp2ps zmm {k}, zmm
CPUID Flags: AVX512ER
Description
Compute the approximate exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-23.
Operation
FOR j := 0 to 15
i := j*32;
dst[i+31:i] := EXP_2_23_SP(a[i+31:i]);
ENDFOR;
dst[MAX:512] := 0
vexp2ps
__m512 _mm512_mask_exp2a23_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_exp2a23_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vexp2ps zmm {k}, zmm
CPUID Flags: AVX512ER
Description
Compute the approximate exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-23.
Operation
FOR j := 0 to 15
i := j*32;
IF k[j] THEN
dst[i+31:i] := EXP_2_23_SP(a[i+31:i]);
ELSE
dst[i+31:i] := src[i+31:i];
FI
ENDFOR;
dst[MAX:512] := 0
vexp2ps
__m512 _mm512_maskz_exp2a23_ps (__mmask16 k, __m512 a)
Synopsis
__m512 _mm512_maskz_exp2a23_ps (__mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vexp2ps zmm {k}, zmm
CPUID Flags: AVX512ER
Description
Compute the approximate exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-23.
Operation
FOR j := 0 to 15
i := j*32;
IF k[j] THEN
dst[i+31:i] := EXP_2_23_SP(a[i+31:i]);
ELSE
dst[i+31:i] := 0;
FI
ENDFOR;
dst[MAX:512] := 0
vexp2pd
__m512d _mm512_exp2a23_round_pd (__m512d a, int rounding)
Synopsis
__m512d _mm512_exp2a23_round_pd (__m512d a, int rounding)
#include "immintrin.h"
Instruction: vexp2pd zmm {k}, zmm {er}
CPUID Flags: AVX512ER
Description
Compute the approximate exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in
a, and store the results in
dst. The maximum relative error for this approximation is less than 2^-23.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64;
dst[i+63:i] := EXP_2_23_DP(a[i+63:i]);
ENDFOR;
dst[MAX:512] := 0
vexp2pd
__m512d _mm512_mask_exp2a23_round_pd (__m512d a, __mmask8 k, __m512d src, int rounding)
Synopsis
__m512d _mm512_mask_exp2a23_round_pd (__m512d a, __mmask8 k, __m512d src, int rounding)
#include "immintrin.h"
Instruction: vexp2pd zmm {k}, zmm {er}
CPUID Flags: AVX512ER
Description
Compute the approximate exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in
a, and store the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-23.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64;
IF k[j] THEN
dst[i+63:i] := EXP_2_23_DP(a[i+63:i]);
ELSE
dst[i+63:i] := src[i+63:i];
FI
ENDFOR;
dst[MAX:512] := 0
vexp2pd
__m512d _mm512_maskz_exp2a23_round_pd (__mmask8 k, __m512d a, int rounding)
Synopsis
__m512d _mm512_maskz_exp2a23_round_pd (__mmask8 k, __m512d a, int rounding)
#include "immintrin.h"
Instruction: vexp2pd zmm {k}, zmm {er}
CPUID Flags: AVX512ER
Description
Compute the approximate exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in
a, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-23.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64;
IF k[j] THEN
dst[i+63:i] := EXP_2_23_DP(a[i+63:i]);
ELSE
dst[i+63:i] := 0;
FI
ENDFOR;
dst[MAX:512] := 0
vexp2ps
__m512 _mm512_exp2a23_round_ps (__m512 a, int rounding)
Synopsis
__m512 _mm512_exp2a23_round_ps (__m512 a, int rounding)
#include "immintrin.h"
Instruction: vexp2ps zmm {k}, zmm {er}
CPUID Flags: AVX512ER
Description
Compute the approximate exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in
a, and store the results in
dst. The maximum relative error for this approximation is less than 2^-23.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32;
dst[i+31:i] := EXP_2_23_SP(a[i+31:i]);
ENDFOR;
dst[MAX:512] := 0
vexp2ps
__m512 _mm512_mask_exp2a23_round_ps (__m512 src, __mmask16 k, __m512 a, int rounding)
Synopsis
__m512 _mm512_mask_exp2a23_round_ps (__m512 src, __mmask16 k, __m512 a, int rounding)
#include "immintrin.h"
Instruction: vexp2ps zmm {k}, zmm {er}
CPUID Flags: AVX512ER
Description
Compute the approximate exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in
a, and store the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-23.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32;
IF k[j] THEN
dst[i+31:i] := EXP_2_23_SP(a[i+31:i]);
ELSE
dst[i+31:i] := src[i+31:i];
FI
ENDFOR;
dst[MAX:512] := 0
vexp2ps
__m512 _mm512_maskz_exp2a23_round_ps (__mmask16 k, __m512 a, int rounding)
Synopsis
__m512 _mm512_maskz_exp2a23_round_ps (__mmask16 k, __m512 a, int rounding)
#include "immintrin.h"
Instruction: vexp2ps zmm {k}, zmm {er}
CPUID Flags: AVX512ER
Description
Compute the approximate exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in
a, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-23.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32;
IF k[j] THEN
dst[i+31:i] := EXP_2_23_SP(a[i+31:i]);
ELSE
dst[i+31:i] := 0;
FI
ENDFOR;
dst[MAX:512] := 0
vpexpandd
__m128i _mm_mask_expand_epi32 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_expand_epi32 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpexpandd
CPUID Flags: AVX512VL + AVX512F
Description
Load contiguous active 32-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
m := 0
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := a[m+31:m]
m := m + 32
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vpexpandd
__m128i _mm_maskz_expand_epi32 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_expand_epi32 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpexpandd
CPUID Flags: AVX512VL + AVX512F
Description
Load contiguous active 32-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
m := 0
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := a[m+31:m]
m := m + 32
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpexpandd
__m256i _mm256_mask_expand_epi32 (__m256i src, __mmask8 k, __m256i a)
Synopsis
__m256i _mm256_mask_expand_epi32 (__m256i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpexpandd
CPUID Flags: AVX512VL + AVX512F
Description
Load contiguous active 32-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
m := 0
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := a[m+31:m]
m := m + 32
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vpexpandd
__m256i _mm256_maskz_expand_epi32 (__mmask8 k, __m256i a)
Synopsis
__m256i _mm256_maskz_expand_epi32 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpexpandd
CPUID Flags: AVX512VL + AVX512F
Description
Load contiguous active 32-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
m := 0
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := a[m+31:m]
m := m + 32
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpexpandd
__m512i _mm512_mask_expand_epi32 (__m512i src, __mmask16 k, __m512i a)
Synopsis
__m512i _mm512_mask_expand_epi32 (__m512i src, __mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpexpandd zmm {k}, zmm
CPUID Flags: AVX512F
Description
Load contiguous active 32-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
m := 0
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := a[m+31:m]
m := m + 32
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpexpandd
__m512i _mm512_maskz_expand_epi32 (__mmask16 k, __m512i a)
Synopsis
__m512i _mm512_maskz_expand_epi32 (__mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vpexpandd zmm {k}, zmm
CPUID Flags: AVX512F
Description
Load contiguous active 32-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
m := 0
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := a[m+31:m]
m := m + 32
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpexpandq
__m128i _mm_mask_expand_epi64 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_expand_epi64 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpexpandq
CPUID Flags: AVX512VL + AVX512F
Description
Load contiguous active 64-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
m := 0
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := a[m+63:m]
m := m + 64
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpexpandq
__m128i _mm_maskz_expand_epi64 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_expand_epi64 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vpexpandq
CPUID Flags: AVX512VL + AVX512F
Description
Load contiguous active 64-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
m := 0
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := a[m+63:m]
m := m + 64
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpexpandq
__m256i _mm256_mask_expand_epi64 (__m256i src, __mmask8 k, __m256i a)
Synopsis
__m256i _mm256_mask_expand_epi64 (__m256i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpexpandq
CPUID Flags: AVX512VL + AVX512F
Description
Load contiguous active 64-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
m := 0
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := a[m+63:m]
m := m + 64
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpexpandq
__m256i _mm256_maskz_expand_epi64 (__mmask8 k, __m256i a)
Synopsis
__m256i _mm256_maskz_expand_epi64 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vpexpandq
CPUID Flags: AVX512VL + AVX512F
Description
Load contiguous active 64-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
m := 0
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := a[m+63:m]
m := m + 64
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpexpandq
__m512i _mm512_mask_expand_epi64 (__m512i src, __mmask8 k, __m512i a)
Synopsis
__m512i _mm512_mask_expand_epi64 (__m512i src, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpexpandq zmm {k}, zmm
CPUID Flags: AVX512F
Description
Load contiguous active 64-bit integers from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
m := 0
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[m+63:m]
m := m + 64
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpexpandq
__m512i _mm512_maskz_expand_epi64 (__mmask8 k, __m512i a)
Synopsis
__m512i _mm512_maskz_expand_epi64 (__mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vpexpandq zmm {k}, zmm
CPUID Flags: AVX512F
Description
Load contiguous active 64-bit integers from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
m := 0
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[m+63:m]
m := m + 64
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vexpandpd
__m128d _mm_mask_expand_pd (__m128d src, __mmask8 k, __m128d a)
Synopsis
__m128d _mm_mask_expand_pd (__m128d src, __mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vexpandpd
CPUID Flags: AVX512VL + AVX512F
Description
Load contiguous active double-precision (64-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
m := 0
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := a[m+63:m]
m := m + 64
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vexpandpd
__m128d _mm_maskz_expand_pd (__mmask8 k, __m128d a)
Synopsis
__m128d _mm_maskz_expand_pd (__mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vexpandpd
CPUID Flags: AVX512VL + AVX512F
Description
Load contiguous active double-precision (64-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
m := 0
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := a[m+63:m]
m := m + 64
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vexpandpd
__m256d _mm256_mask_expand_pd (__m256d src, __mmask8 k, __m256d a)
Synopsis
__m256d _mm256_mask_expand_pd (__m256d src, __mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vexpandpd
CPUID Flags: AVX512VL + AVX512F
Description
Load contiguous active double-precision (64-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
m := 0
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := a[m+63:m]
m := m + 64
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vexpandpd
__m256d _mm256_maskz_expand_pd (__mmask8 k, __m256d a)
Synopsis
__m256d _mm256_maskz_expand_pd (__mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vexpandpd
CPUID Flags: AVX512VL + AVX512F
Description
Load contiguous active double-precision (64-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
m := 0
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := a[m+63:m]
m := m + 64
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vexpandpd
__m512d _mm512_mask_expand_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_expand_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vexpandpd zmm {k}, zmm
CPUID Flags: AVX512F
Description
Load contiguous active double-precision (64-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
m := 0
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[m+63:m]
m := m + 64
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vexpandpd
__m512d _mm512_maskz_expand_pd (__mmask8 k, __m512d a)
Synopsis
__m512d _mm512_maskz_expand_pd (__mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vexpandpd zmm {k}, zmm
CPUID Flags: AVX512F
Description
Load contiguous active double-precision (64-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
m := 0
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[m+63:m]
m := m + 64
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vexpandps
__m128 _mm_mask_expand_ps (__m128 src, __mmask8 k, __m128 a)
Synopsis
__m128 _mm_mask_expand_ps (__m128 src, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vexpandps
CPUID Flags: AVX512VL + AVX512F
Description
Load contiguous active single-precision (32-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
m := 0
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := a[m+31:m]
m := m + 32
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vexpandps
__m128 _mm_maskz_expand_ps (__mmask8 k, __m128 a)
Synopsis
__m128 _mm_maskz_expand_ps (__mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vexpandps
CPUID Flags: AVX512VL + AVX512F
Description
Load contiguous active single-precision (32-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
m := 0
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := a[m+31:m]
m := m + 32
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vexpandps
__m256 _mm256_mask_expand_ps (__m256 src, __mmask8 k, __m256 a)
Synopsis
__m256 _mm256_mask_expand_ps (__m256 src, __mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vexpandps
CPUID Flags: AVX512VL + AVX512F
Description
Load contiguous active single-precision (32-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
m := 0
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := a[m+31:m]
m := m + 32
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vexpandps
__m256 _mm256_maskz_expand_ps (__mmask8 k, __m256 a)
Synopsis
__m256 _mm256_maskz_expand_ps (__mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vexpandps
CPUID Flags: AVX512VL + AVX512F
Description
Load contiguous active single-precision (32-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
m := 0
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := a[m+31:m]
m := m + 32
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vexpandps
__m512 _mm512_mask_expand_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_expand_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vexpandps zmm {k}, zmm
CPUID Flags: AVX512F
Description
Load contiguous active single-precision (32-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
m := 0
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := a[m+31:m]
m := m + 32
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vexpandps
__m512 _mm512_maskz_expand_ps (__mmask16 k, __m512 a)
Synopsis
__m512 _mm512_maskz_expand_ps (__mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vexpandps zmm {k}, zmm
CPUID Flags: AVX512F
Description
Load contiguous active single-precision (32-bit) floating-point elements from a (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
m := 0
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := a[m+31:m]
m := m + 32
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpexpandd
__m128i _mm_mask_expandloadu_epi32 (__m128i src, __mmask8 k, void const* mem_addr)
Synopsis
__m128i _mm_mask_expandloadu_epi32 (__m128i src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vpexpandd
CPUID Flags: AVX512VL + AVX512F
Description
Load contiguous active 32-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
m := 0
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m]
m := m + 32
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vpexpandd
__m128i _mm_maskz_expandloadu_epi32 (__mmask8 k, void const* mem_addr)
Synopsis
__m128i _mm_maskz_expandloadu_epi32 (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vpexpandd
CPUID Flags: AVX512VL + AVX512F
Description
Load contiguous active 32-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
m := 0
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m]
m := m + 32
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpexpandd
__m256i _mm256_mask_expandloadu_epi32 (__m256i src, __mmask8 k, void const* mem_addr)
Synopsis
__m256i _mm256_mask_expandloadu_epi32 (__m256i src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vpexpandd
CPUID Flags: AVX512VL + AVX512F
Description
Load contiguous active 32-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
m := 0
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m]
m := m + 32
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vpexpandd
__m256i _mm256_maskz_expandloadu_epi32 (__mmask8 k, void const* mem_addr)
Synopsis
__m256i _mm256_maskz_expandloadu_epi32 (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vpexpandd
CPUID Flags: AVX512VL + AVX512F
Description
Load contiguous active 32-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
m := 0
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m]
m := m + 32
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpexpandd
__m512i _mm512_mask_expandloadu_epi32 (__m512i src, __mmask16 k, void const* mem_addr)
Synopsis
__m512i _mm512_mask_expandloadu_epi32 (__m512i src, __mmask16 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vpexpandd zmm {k}, m32
CPUID Flags: AVX512F
Description
Load contiguous active 32-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
m := 0
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m]
m := m + 32
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpexpandd
__m512i _mm512_maskz_expandloadu_epi32 (__mmask16 k, void const* mem_addr)
Synopsis
__m512i _mm512_maskz_expandloadu_epi32 (__mmask16 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vpexpandd zmm {k}, m32
CPUID Flags: AVX512F
Description
Load contiguous active 32-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
m := 0
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m]
m := m + 32
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpexpandq
__m128i _mm_mask_expandloadu_epi64 (__m128i src, __mmask8 k, void const* mem_addr)
Synopsis
__m128i _mm_mask_expandloadu_epi64 (__m128i src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vpexpandq
CPUID Flags: AVX512VL + AVX512F
Description
Load contiguous active 64-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
m := 0
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m]
m := m + 64
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpexpandq
__m128i _mm_maskz_expandloadu_epi64 (__mmask8 k, void const* mem_addr)
Synopsis
__m128i _mm_maskz_expandloadu_epi64 (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vpexpandq
CPUID Flags: AVX512VL + AVX512F
Description
Load contiguous active 64-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
m := 0
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m]
m := m + 64
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpexpandq
__m256i _mm256_mask_expandloadu_epi64 (__m256i src, __mmask8 k, void const* mem_addr)
Synopsis
__m256i _mm256_mask_expandloadu_epi64 (__m256i src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vpexpandq
CPUID Flags: AVX512VL + AVX512F
Description
Load contiguous active 64-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
m := 0
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m]
m := m + 64
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpexpandq
__m256i _mm256_maskz_expandloadu_epi64 (__mmask8 k, void const* mem_addr)
Synopsis
__m256i _mm256_maskz_expandloadu_epi64 (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vpexpandq
CPUID Flags: AVX512VL + AVX512F
Description
Load contiguous active 64-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
m := 0
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m]
m := m + 64
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpexpandq
__m512i _mm512_mask_expandloadu_epi64 (__m512i src, __mmask8 k, void const* mem_addr)
Synopsis
__m512i _mm512_mask_expandloadu_epi64 (__m512i src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vpexpandq zmm {k}, m64
CPUID Flags: AVX512F
Description
Load contiguous active 64-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
m := 0
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m]
m := m + 64
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpexpandq
__m512i _mm512_maskz_expandloadu_epi64 (__mmask8 k, void const* mem_addr)
Synopsis
__m512i _mm512_maskz_expandloadu_epi64 (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vpexpandq zmm {k}, m64
CPUID Flags: AVX512F
Description
Load contiguous active 64-bit integers from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
m := 0
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m]
m := m + 64
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vexpandpd
__m128d _mm_mask_expandloadu_pd (__m128d src, __mmask8 k, void const* mem_addr)
Synopsis
__m128d _mm_mask_expandloadu_pd (__m128d src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vexpandpd
CPUID Flags: AVX512VL + AVX512F
Description
Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
m := 0
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m]
m := m + 64
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vexpandpd
__m128d _mm_maskz_expandloadu_pd (__mmask8 k, void const* mem_addr)
Synopsis
__m128d _mm_maskz_expandloadu_pd (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vexpandpd
CPUID Flags: AVX512VL + AVX512F
Description
Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
m := 0
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m]
m := m + 64
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vexpandpd
__m256d _mm256_mask_expandloadu_pd (__m256d src, __mmask8 k, void const* mem_addr)
Synopsis
__m256d _mm256_mask_expandloadu_pd (__m256d src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vexpandpd
CPUID Flags: AVX512VL + AVX512F
Description
Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
m := 0
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m]
m := m + 64
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vexpandpd
__m256d _mm256_maskz_expandloadu_pd (__mmask8 k, void const* mem_addr)
Synopsis
__m256d _mm256_maskz_expandloadu_pd (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vexpandpd
CPUID Flags: AVX512VL + AVX512F
Description
Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
m := 0
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m]
m := m + 64
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vexpandpd
__m512d _mm512_mask_expandloadu_pd (__m512d src, __mmask8 k, void const* mem_addr)
Synopsis
__m512d _mm512_mask_expandloadu_pd (__m512d src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vexpandpd zmm {k}, m512
CPUID Flags: AVX512F
Description
Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
m := 0
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m]
m := m + 64
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vexpandpd
__m512d _mm512_maskz_expandloadu_pd (__mmask8 k, void const* mem_addr)
Synopsis
__m512d _mm512_maskz_expandloadu_pd (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vexpandpd zmm {k}, m512
CPUID Flags: AVX512F
Description
Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
m := 0
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := MEM[mem_addr+m+63:mem_addr+m]
m := m + 64
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vexpandps
__m128 _mm_mask_expandloadu_ps (__m128 src, __mmask8 k, void const* mem_addr)
Synopsis
__m128 _mm_mask_expandloadu_ps (__m128 src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vexpandps
CPUID Flags: AVX512VL + AVX512F
Description
Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
m := 0
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m]
m := m + 32
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vexpandps
__m128 _mm_maskz_expandloadu_ps (__mmask8 k, void const* mem_addr)
Synopsis
__m128 _mm_maskz_expandloadu_ps (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vexpandps
CPUID Flags: AVX512VL + AVX512F
Description
Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
m := 0
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m]
m := m + 32
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vexpandps
__m256 _mm256_mask_expandloadu_ps (__m256 src, __mmask8 k, void const* mem_addr)
Synopsis
__m256 _mm256_mask_expandloadu_ps (__m256 src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vexpandps
CPUID Flags: AVX512VL + AVX512F
Description
Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
m := 0
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m]
m := m + 32
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vexpandps
__m256 _mm256_maskz_expandloadu_ps (__mmask8 k, void const* mem_addr)
Synopsis
__m256 _mm256_maskz_expandloadu_ps (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vexpandps
CPUID Flags: AVX512VL + AVX512F
Description
Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
m := 0
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m]
m := m + 32
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vexpandps
__m512 _mm512_mask_expandloadu_ps (__m512 src, __mmask16 k, void const* mem_addr)
Synopsis
__m512 _mm512_mask_expandloadu_ps (__m512 src, __mmask16 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vexpandps zmm {k}, m512
CPUID Flags: AVX512F
Description
Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
m := 0
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m]
m := m + 32
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vexpandps
__m512 _mm512_maskz_expandloadu_ps (__mmask16 k, void const* mem_addr)
Synopsis
__m512 _mm512_maskz_expandloadu_ps (__mmask16 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vexpandps zmm {k}, m512
CPUID Flags: AVX512F
Description
Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at mem_addr (those with their respective bit set in mask k), and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
m := 0
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := MEM[mem_addr+m+31:mem_addr+m]
m := m + 32
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
...
__m128d _mm_expm1_pd (__m128d a)
Synopsis
__m128d _mm_expm1_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE2
Description
Compute the exponential value of e raised to the power of packed double-precision (64-bit) floating-point elements in a, subtract one from each element, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := e^(a[i+63:i]) - 1.0
ENDFOR
dst[MAX:128] := 0
...
__m256d _mm256_expm1_pd (__m256d a)
Synopsis
__m256d _mm256_expm1_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the exponential value of e raised to the power of packed double-precision (64-bit) floating-point elements in a, subtract one from each element, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := e^(a[i+63:i]) - 1.0
ENDFOR
dst[MAX:256] := 0
...
__m512d _mm512_expm1_pd (__m512d a)
Synopsis
__m512d _mm512_expm1_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the exponential value of e raised to the power of packed double-precision (64-bit) floating-point elements in a, subtract one from each element, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := e^(a[i+63:i]) - 1.0
ENDFOR
dst[MAX:512] := 0
...
__m512d _mm512_mask_expm1_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_expm1_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the exponential value of e raised to the power of packed double-precision (64-bit) floating-point elements in a, subtract one from each element, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := e^(a[i+63:i]) - 1.0
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128 _mm_expm1_ps (__m128 a)
Synopsis
__m128 _mm_expm1_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the exponential value of e raised to the power of packed single-precision (32-bit) floating-point elements in a, subtract one from each element, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := e^(a[i+31:i]) - 1.0
ENDFOR
dst[MAX:128] := 0
...
__m256 _mm256_expm1_ps (__m256 a)
Synopsis
__m256 _mm256_expm1_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the exponential value of e raised to the power of packed single-precision (32-bit) floating-point elements in a, subtract one from each element, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := e^(a[i+31:i]) - 1.0
ENDFOR
dst[MAX:256] := 0
...
__m512 _mm512_expm1_ps (__m512 a)
Synopsis
__m512 _mm512_expm1_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the exponential value of e raised to the power of packed single-precision (32-bit) floating-point elements in a, subtract one from each element, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := e^(a[i+31:i]) - 1.0
ENDFOR
dst[MAX:512] := 0
...
__m512 _mm512_mask_expm1_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_expm1_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the exponential value of e raised to the power of packed single-precision (32-bit) floating-point elements in a, subtract one from each element, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := e^(a[i+31:i]) - 1.0
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vmovdqa32, vbroadcasti32x4, vpbroadcastd
__m512i _mm512_extload_epi32 (void const * mt, _MM_UPCONV_EPI32_ENUM conv, _MM_BROADCAST32_ENUM bc, int hint)
Synopsis
__m512i _mm512_extload_epi32 (void const * mt, _MM_UPCONV_EPI32_ENUM conv, _MM_BROADCAST32_ENUM bc, int hint)
#include "immintrin.h"
Instruction: vmovdqa32 zmm {k}, m512
vbroadcasti32x4 zmm {k}, m512
vpbroadcastd zmm {k}, m512
CPUID Flags: KNCNI
Description
Depending on bc, loads 1, 4, or 16 elements of type and size determined by conv from memory address mt and converts all elements to 32-bit integer elements, storing the results in dst. hint indicates to the processor whether the data is non-temporal.
Operation
addr = MEM[mt]
FOR j := 0 to 15
i := j*32
CASE bc OF
_MM_BROADCAST32_NONE:
CASE conv OF
_MM_UPCONV_EPI32_NONE:
n := j*32
dst[i+31:i] := addr[n+31:n]
_MM_UPCONV_EPI32_UINT8:
n := j*8
dst[i+31:i] := UInt8ToInt32(addr[n+7:n])
_MM_UPCONV_EPI32_SINT8:
n := j*8
dst[i+31:i] := SInt8ToInt32(addr[n+7:n])
_MM_UPCONV_EPI32_UINT16:
n := j*16
dst[i+31:i] := UInt16ToInt32(addr[n+15:n])
_MM_UPCONV_EPI32_SINT16:
n := j*16
dst[i+31:i] := SInt16ToInt32(addr[n+15:n])
ESAC
_MM_BROADCAST_1X16:
CASE conv OF
_MM_UPCONV_EPI32_NONE:
n := j*32
dst[i+31:i] := addr[31:0]
_MM_UPCONV_EPI32_UINT8:
n := j*8
dst[i+31:i] := UInt8ToInt32(addr[7:0])
_MM_UPCONV_EPI32_SINT8:
n := j*8
dst[i+31:i] := SInt8ToInt32(addr[7:0])
_MM_UPCONV_EPI32_UINT16:
n := j*16
dst[i+31:i] := UInt16ToInt32(addr[15:0])
_MM_UPCONV_EPI32_SINT16:
n := j*16
dst[i+31:i] := SInt16ToInt32(addr[15:0])
ESAC
_MM_BROADCAST_4X16:
mod := j%4
CASE conv OF
_MM_UPCONV_EPI32_NONE:
n := mod*32
dst[i+31:i] := addr[n+31:n]
_MM_UPCONV_EPI32_UINT8:
n := mod*8
dst[i+31:i] := UInt8ToInt32(addr[n+7:n])
_MM_UPCONV_EPI32_SINT8:
n := mod*8
dst[i+31:i] := SInt8ToInt32(addr[n+7:n])
_MM_UPCONV_EPI32_UINT16:
n := mod*16
dst[i+31:i] := UInt16ToInt32(addr[n+15:n])
_MM_UPCONV_EPI32_SINT16:
n := mod*16
dst[i+31:i] := SInt16ToInt32(addr[n+15:n])
ESAC
ESAC
ENDFOR
dst[MAX:512] := 0
vmovdqa32, vbroadcasti32x4, vpbroadcastd
__m512i _mm512_mask_extload_epi32 (__m512i src, __mmask16 k, void const * mt, _MM_UPCONV_EPI32_ENUM conv, _MM_BROADCAST32_ENUM bc, int hint)
Synopsis
__m512i _mm512_mask_extload_epi32 (__m512i src, __mmask16 k, void const * mt, _MM_UPCONV_EPI32_ENUM conv, _MM_BROADCAST32_ENUM bc, int hint)
#include "immintrin.h"
Instruction: vmovdqa32 zmm {k}, m512
vbroadcasti32x4 zmm {k}, m512
vpbroadcastd zmm {k}, m512
CPUID Flags: KNCNI
Description
Depending on bc, loads 1, 4, or 16 elements of type and size determined by conv from memory address mt and converts all elements to 32-bit integer elements, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). hint indicates to the processor whether the data is non-temporal.
Operation
addr = MEM[mt]
FOR j := 0 to 15
i := j*32
IF k[j]
CASE bc OF
_MM_BROADCAST32_NONE:
CASE conv OF
_MM_UPCONV_EPI32_NONE:
n := j*32
dst[i+31:i] := addr[n+31:n]
_MM_UPCONV_EPI32_UINT8:
n := j*8
dst[i+31:i] := UInt8ToInt32(addr[n+7:n])
_MM_UPCONV_EPI32_SINT8:
n := j*8
dst[i+31:i] := SInt8ToInt32(addr[n+7:n])
_MM_UPCONV_EPI32_UINT16:
n := j*16
dst[i+31:i] := UInt16ToInt32(addr[n+15:n])
_MM_UPCONV_EPI32_SINT16:
n := j*16
dst[i+31:i] := SInt16ToInt32(addr[n+15:n])
ESAC
_MM_BROADCAST_1X16:
CASE conv OF
_MM_UPCONV_EPI32_NONE:
n := j*32
dst[i+31:i] := addr[31:0]
_MM_UPCONV_EPI32_UINT8:
n := j*8
dst[i+31:i] := UInt8ToInt32(addr[7:0])
_MM_UPCONV_EPI32_SINT8:
n := j*8
dst[i+31:i] := SInt8ToInt32(addr[7:0])
_MM_UPCONV_EPI32_UINT16:
n := j*16
dst[i+31:i] := UInt16ToInt32(addr[15:0])
_MM_UPCONV_EPI32_SINT16:
n := j*16
dst[i+31:i] := SInt16ToInt32(addr[15:0])
ESAC
_MM_BROADCAST_4X16:
mod := j%4
CASE conv OF
_MM_UPCONV_EPI32_NONE:
n := mod*32
dst[i+31:i] := addr[n+31:n]
_MM_UPCONV_EPI32_UINT8:
n := mod*8
dst[i+31:i] := UInt8ToInt32(addr[n+7:n])
_MM_UPCONV_EPI32_SINT8:
n := mod*8
dst[i+31:i] := SInt8ToInt32(addr[n+7:n])
_MM_UPCONV_EPI32_UINT16:
n := mod*16
dst[i+31:i] := UInt16ToInt32(addr[n+15:n])
_MM_UPCONV_EPI32_SINT16:
n := mod*16
dst[i+31:i] := SInt16ToInt32(addr[n+15:n])
ESAC
ESAC
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vmovdqa64, vbroadcasti64x4, vpbroadcastq
__m512i _mm512_extload_epi64 (void const * mt, _MM_UPCONV_EPI64_ENUM conv, _MM_BROADCAST64_ENUM bc, int hint)
Synopsis
__m512i _mm512_extload_epi64 (void const * mt, _MM_UPCONV_EPI64_ENUM conv, _MM_BROADCAST64_ENUM bc, int hint)
#include "immintrin.h"
Instruction: vmovdqa64 zmm {k}, m512
vbroadcasti64x4 zmm {k}, m512
vpbroadcastq zmm {k}, m512
CPUID Flags: KNCNI
Description
Depending on bc, loads 1, 4, or 8 elements of type and size determined by conv from memory address mt and converts all elements to 64-bit integer elements, storing the results in dst. hint indicates to the processor whether the data is non-temporal.
Operation
addr = MEM[mt]
FOR j := 0 to 7
i := j*64
CASE bc OF
_MM_BROADCAST64_NONE:
CASE conv OF
_MM_UPCONV_EPI64_NONE:
n := j*64
dst[i+63:i] := addr[n+63:n]
ESAC
_MM_BROADCAST_1X8:
CASE conv OF
_MM_UPCONV_EPI64_NONE:
n := j*64
dst[i+63:i] := addr[63:0]
ESAC
_MM_BROADCAST_4X8:
mod := j%4
CASE conv OF
_MM_UPCONV_EPI64_NONE:
n := mod*64
dst[i+63:i] := addr[n+63:n]
ESAC
ESAC
ENDFOR
dst[MAX:512] := 0
vmovdqa64, vbroadcasti64x4, vpbroadcastq
__m512i _mm512_mask_extload_epi64 (__m512i src, __mmask8 k, void const * mt, _MM_UPCONV_EPI64_ENUM conv, _MM_BROADCAST64_ENUM bc, int hint)
Synopsis
__m512i _mm512_mask_extload_epi64 (__m512i src, __mmask8 k, void const * mt, _MM_UPCONV_EPI64_ENUM conv, _MM_BROADCAST64_ENUM bc, int hint)
#include "immintrin.h"
Instruction: vmovdqa64 zmm {k}, m512
vbroadcasti64x4 zmm {k}, m512
vpbroadcastq zmm {k}, m512
CPUID Flags: KNCNI
Description
Depending on bc, loads 1, 4, or 8 elements of type and size determined by conv from memory address mt and converts all elements to 64-bit integer elements, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). hint indicates to the processor whether the data is non-temporal.
Operation
addr = MEM[mt]
FOR j := 0 to 7
i := j*64
IF k[j]
CASE bc OF
_MM_BROADCAST64_NONE:
CASE conv OF
_MM_UPCONV_EPI64_NONE:
n := j*64
dst[i+63:i] := addr[n+63:n]
ESAC
_MM_BROADCAST_1X8:
CASE conv OF
_MM_UPCONV_EPI64_NONE:
n := j*64
dst[i+63:i] := addr[63:0]
ESAC
_MM_BROADCAST_4X8:
mod := j%4
CASE conv OF
_MM_UPCONV_EPI64_NONE:
n := mod*64
dst[i+63:i] := addr[n+63:n]
ESAC
ESAC
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vmovapd, vbroadcastf64x4, vbroadcastsd
__m512d _mm512_extload_pd (void const * mt, _MM_UPCONV_PD_ENUM conv, _MM_BROADCAST64_ENUM bc, int hint)
Synopsis
__m512d _mm512_extload_pd (void const * mt, _MM_UPCONV_PD_ENUM conv, _MM_BROADCAST64_ENUM bc, int hint)
#include "immintrin.h"
Instruction: vmovapd zmm {k}, m512
vbroadcastf64x4 zmm {k}, m512
vbroadcastsd zmm {k}, m512
CPUID Flags: KNCNI
Description
Depending on bc, loads 1, 4, or 8 elements of type and size determined by conv from memory address mt and converts all elements to double-precision (64-bit) floating-point elements, storing the results in dst. hint indicates to the processor whether the data is non-temporal.
Operation
addr = MEM[mt]
FOR j := 0 to 7
i := j*64
CASE bc OF
_MM_BROADCAST64_NONE:
CASE conv OF
_MM_UPCONV_PD_NONE:
n := j*64
dst[i+63:i] := addr[n+63:n]
ESAC
_MM_BROADCAST_1X8:
CASE conv OF
_MM_UPCONV_PD_NONE:
n := j*64
dst[i+63:i] := addr[63:0]
ESAC
_MM_BROADCAST_4X8:
mod := j%4
CASE conv OF
_MM_UPCONV_PD_NONE:
n := mod*64
dst[i+63:i] := addr[n+63:n]
ESAC
ESAC
ENDFOR
dst[MAX:512] := 0
vmovapd, vbroadcastf64x4, vbroadcastsd
__m512d _mm512_mask_extload_pd (__m512d src, __mmask8 k, void const * mt, _MM_UPCONV_PD_ENUM conv, _MM_BROADCAST64_ENUM bc, int hint)
Synopsis
__m512d _mm512_mask_extload_pd (__m512d src, __mmask8 k, void const * mt, _MM_UPCONV_PD_ENUM conv, _MM_BROADCAST64_ENUM bc, int hint)
#include "immintrin.h"
Instruction: vmovapd zmm {k}, m512
vbroadcastf64x4 zmm {k}, m512
vbroadcastsd zmm {k}, m512
CPUID Flags: KNCNI
Description
Depending on bc, loads 1, 4, or 8 elements of type and size determined by conv from memory address mt and converts all elements to double-precision (64-bit) floating-point elements, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). hint indicates to the processor whether the data is non-temporal.
Operation
addr = MEM[mt]
FOR j := 0 to 7
i := j*64
IF k[j]
CASE bc OF
_MM_BROADCAST64_NONE:
CASE conv OF
_MM_UPCONV_PD_NONE:
n := j*64
dst[i+63:i] := addr[n+63:n]
ESAC
_MM_BROADCAST_1X8:
CASE conv OF
_MM_UPCONV_PD_NONE:
n := j*64
dst[i+63:i] := addr[63:0]
ESAC
_MM_BROADCAST_4X8:
mod := j%4
CASE conv OF
_MM_UPCONV_PD_NONE:
n := mod*64
dst[i+63:i] := addr[n+63:n]
ESAC
ESAC
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vmovaps, vbroadcastf32x4, vbroadcastss
__m512 _mm512_extload_ps (void const * mt, _MM_UPCONV_PS_ENUM conv, _MM_BROADCAST32_ENUM bc, int hint)
Synopsis
__m512 _mm512_extload_ps (void const * mt, _MM_UPCONV_PS_ENUM conv, _MM_BROADCAST32_ENUM bc, int hint)
#include "immintrin.h"
Instruction: vmovaps zmm {k}, m512
vbroadcastf32x4 zmm {k}, m512
vbroadcastss zmm {k}, m512
CPUID Flags: KNCNI
Description
Depending on bc, loads 1, 4, or 16 elements of type and size determined by conv from memory address mt and converts all elements to single-precision (32-bit) floating-point elements, storing the results in dst. hint indicates to the processor whether the data is non-temporal.
Operation
addr = MEM[mt]
FOR j := 0 to 15
i := j*32
CASE bc OF
_MM_BROADCAST32_NONE:
CASE conv OF
_MM_UPCONV_PS_NONE:
n := j*32
dst[i+31:i] := addr[n+31:n]
_MM_UPCONV_PS_FLOAT16:
n := j*16
dst[i+31:i] := Float16ToFloat32(addr[n+15:n])
_MM_UPCONV_PS_UINT8:
n := j*8
dst[i+31:i] := UInt8ToFloat32(addr[n+7:n])
_MM_UPCONV_PS_SINT8:
n := j*8
dst[i+31:i] := SInt8ToFloat32(addr[n+7:n])
_MM_UPCONV_PS_UINT16:
n := j*16
dst[i+31:i] := UInt16ToFloat32(addr[n+15:n])
_MM_UPCONV_PS_SINT16:
n := j*16
dst[i+31:i] := SInt16ToFloat32(addr[n+15:n])
ESAC
_MM_BROADCAST_1X16:
CASE conv OF
_MM_UPCONV_PS_NONE:
n := j*32
dst[i+31:i] := addr[31:0]
_MM_UPCONV_PS_FLOAT16:
n := j*16
dst[i+31:i] := Float16ToFloat32(addr[15:0])
_MM_UPCONV_PS_UINT8:
n := j*8
dst[i+31:i] := UInt8ToFloat32(addr[7:0])
_MM_UPCONV_PS_SINT8:
n := j*8
dst[i+31:i] := SInt8ToFloat32(addr[7:0])
_MM_UPCONV_PS_UINT16:
n := j*16
dst[i+31:i] := UInt16ToFloat32(addr[15:0])
_MM_UPCONV_PS_SINT16:
n := j*16
dst[i+31:i] := SInt16ToFloat32(addr[15:0])
ESAC
_MM_BROADCAST_4X16:
mod := j%4
CASE conv OF
_MM_UPCONV_PS_NONE:
n := mod*32
dst[i+31:i] := addr[n+31:n]
_MM_UPCONV_PS_FLOAT16:
n := mod*16
dst[i+31:i] := Float16ToFloat32(addr[n+15:n])
_MM_UPCONV_PS_UINT8:
n := mod*8
dst[i+31:i] := UInt8ToFloat32(addr[n+7:n])
_MM_UPCONV_PS_SINT8:
n := mod*8
dst[i+31:i] := SInt8ToFloat32(addr[n+7:n])
_MM_UPCONV_PS_UINT16:
n := mod*16
dst[i+31:i] := UInt16ToFloat32(addr[n+15:n])
_MM_UPCONV_PS_SINT16:
n := mod*16
dst[i+31:i] := SInt16ToFloat32(addr[n+15:n])
ESAC
ESAC
ENDFOR
dst[MAX:512] := 0
vmovaps, vbroadcastf32x4, vbroadcastss
__m512 _mm512_mask_extload_ps (__m512 src, __mmask16 k, void const * mt, _MM_UPCONV_PS_ENUM conv, _MM_BROADCAST32_ENUM bc, int hint)
Synopsis
__m512 _mm512_mask_extload_ps (__m512 src, __mmask16 k, void const * mt, _MM_UPCONV_PS_ENUM conv, _MM_BROADCAST32_ENUM bc, int hint)
#include "immintrin.h"
Instruction: vmovaps zmm {k}, m512
vbroadcastf32x4 zmm {k}, m512
vbroadcastss zmm {k}, m512
CPUID Flags: KNCNI
Description
Depending on bc, loads 1, 4, or 16 elements of type and size determined by conv from memory address mt and converts all elements to single-precision (32-bit) floating-point elements, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). hint indicates to the processor whether the data is non-temporal.
Operation
addr = MEM[mt]
FOR j := 0 to 15
i := j*32
IF k[j]
CASE bc OF
_MM_BROADCAST32_NONE:
CASE conv OF
_MM_UPCONV_PS_NONE:
n := j*32
dst[i+31:i] := addr[n+31:n]
_MM_UPCONV_PS_FLOAT16:
n := j*16
dst[i+31:i] := Float16ToFloat32(addr[n+15:n])
_MM_UPCONV_PS_UINT8:
n := j*8
dst[i+31:i] := UInt8ToFloat32(addr[n+7:n])
_MM_UPCONV_PS_SINT8:
n := j*8
dst[i+31:i] := SInt8ToFloat32(addr[n+7:n])
_MM_UPCONV_PS_UINT16:
n := j*16
dst[i+31:i] := UInt16ToFloat32(addr[n+15:n])
_MM_UPCONV_PS_SINT16:
n := j*16
dst[i+31:i] := SInt16ToFloat32(addr[n+15:n])
ESAC
_MM_BROADCAST_1X16:
CASE conv OF
_MM_UPCONV_PS_NONE:
n := j*32
dst[i+31:i] := addr[31:0]
_MM_UPCONV_PS_FLOAT16:
n := j*16
dst[i+31:i] := Float16ToFloat32(addr[15:0])
_MM_UPCONV_PS_UINT8:
n := j*8
dst[i+31:i] := UInt8ToFloat32(addr[7:0])
_MM_UPCONV_PS_SINT8:
n := j*8
dst[i+31:i] := SInt8ToFloat32(addr[7:0])
_MM_UPCONV_PS_UINT16:
n := j*16
dst[i+31:i] := UInt16ToFloat32(addr[15:0])
_MM_UPCONV_PS_SINT16:
n := j*16
dst[i+31:i] := SInt16ToFloat32(addr[15:0])
ESAC
_MM_BROADCAST_4X16:
mod := j%4
CASE conv OF
_MM_UPCONV_PS_NONE:
n := mod*32
dst[i+31:i] := addr[n+31:n]
_MM_UPCONV_PS_FLOAT16:
n := mod*16
dst[i+31:i] := Float16ToFloat32(addr[n+15:n])
_MM_UPCONV_PS_UINT8:
n := mod*8
dst[i+31:i] := UInt8ToFloat32(addr[n+7:n])
_MM_UPCONV_PS_SINT8:
n := mod*8
dst[i+31:i] := SInt8ToFloat32(addr[n+7:n])
_MM_UPCONV_PS_UINT16:
n := mod*16
dst[i+31:i] := UInt16ToFloat32(addr[n+15:n])
_MM_UPCONV_PS_SINT16:
n := mod*16
dst[i+31:i] := SInt16ToFloat32(addr[n+15:n])
ESAC
ESAC
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vloadunpackhd
__m512i _mm512_extloadunpackhi_epi32 (__m512i src, void const * mt, _MM_UPCONV_EPI32_ENUM conv, int hint)
Synopsis
__m512i _mm512_extloadunpackhi_epi32 (__m512i src, void const * mt, _MM_UPCONV_EPI32_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vloadunpackhd zmm {k}, m512
CPUID Flags: KNCNI
Description
Loads the high-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt-64, up-converted depending on the value of conv, and expanded into packed 32-bit integers in dst. The initial values of dst are copied from src. Only those converted doublewords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those doublewords are taken from src. hint indicates to the processor whether the loaded data is non-temporal.
Operation
UPCONVERT(address, offset, convertTo) {
CASE conv OF
_MM_UPCONV_EPI32_NONE: RETURN MEM[addr + 4*offset]
_MM_UPCONV_EPI32_UINT8: RETURN UInt8ToInt32(MEM[addr + offset])
_MM_UPCONV_EPI32_SINT8: RETURN SInt8ToInt32(MEM[addr + offset])
_MM_UPCONV_EPI32_UINT16: RETURN UInt16ToInt32(MEM[addr + 2*offset])
_MM_UPCONV_EPI32_SINT16: RETURN SInt16ToInt32(MEM[addr + 2*offset])
ESAC
}
UPCONVERTSIZE(convertTo) {
CASE conv OF
_MM_UPCONV_EPI32_NONE: RETURN 4
_MM_UPCONV_EPI32_UINT8: RETURN 1
_MM_UPCONV_EPI32_SINT8: RETURN 1
_MM_UPCONV_EPI32_UINT16: RETURN 2
_MM_UPCONV_EPI32_SINT16: RETURN 2
ESAC
}
dst[511:0] := src[511:0]
loadOffset := 0
foundNext64BytesBoundary := false
upSize := UPCONVERTSIZE(conv)
addr = mt-64
FOR j := 0 to 15
IF foundNext64BytesBoundary == false
IF (addr + (loadOffset + 1)*upSize) % 64 == 0
foundNext64BytesBoundary := true
FI
ELSE
i := j*32
dst[i+31:i] := UPCONVERT(addr, loadOffset, conv)
FI
loadOffset := loadOffset + 1
ENDFOR
dst[MAX:512] := 0
vloadunpackhd
__m512i _mm512_mask_extloadunpackhi_epi32 (__m512i src, __mmask16 k, void const * mt, _MM_UPCONV_EPI32_ENUM conv, int hint)
Synopsis
__m512i _mm512_mask_extloadunpackhi_epi32 (__m512i src, __mmask16 k, void const * mt, _MM_UPCONV_EPI32_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vloadunpackhd zmm {k}, m512
CPUID Flags: KNCNI
Description
Loads the high-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt-64, up-converted depending on the value of conv, and expanded into packed 32-bit integers in dst. The initial values of dst are copied from src. Only those converted doublewords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those doublewords are taken from src. hint indicates to the processor whether the loaded data is non-temporal. Elements are copied to dst according to element selector k (elements are skipped when the corresponding mask bit is not set).
Operation
UPCONVERT(address, offset, convertTo) {
CASE conv OF
_MM_UPCONV_EPI32_NONE: RETURN MEM[addr + 4*offset]
_MM_UPCONV_EPI32_UINT8: RETURN UInt8ToInt32(MEM[addr + offset])
_MM_UPCONV_EPI32_SINT8: RETURN SInt8ToInt32(MEM[addr + offset])
_MM_UPCONV_EPI32_UINT16: RETURN UInt16ToInt32(MEM[addr + 2*offset])
_MM_UPCONV_EPI32_SINT16: RETURN SInt16ToInt32(MEM[addr + 2*offset])
ESAC
}
UPCONVERTSIZE(convertTo) {
CASE conv OF
_MM_UPCONV_EPI32_NONE: RETURN 4
_MM_UPCONV_EPI32_UINT8: RETURN 1
_MM_UPCONV_EPI32_SINT8: RETURN 1
_MM_UPCONV_EPI32_UINT16: RETURN 2
_MM_UPCONV_EPI32_SINT16: RETURN 2
ESAC
}
dst[511:0] := src[511:0]
loadOffset := 0
foundNext64BytesBoundary := false
upSize := UPCONVERTSIZE(conv)
addr = mt-64
FOR j := 0 to 15
IF k[j]
IF foundNext64BytesBoundary == false
IF (addr + (loadOffset + 1)*upSize) % 64 == 0
foundNext64BytesBoundary := true
FI
ELSE
i := j*32
dst[i+31:i] := UPCONVERT(addr, loadOffset, conv)
FI
loadOffset := loadOffset + 1
FI
ENDFOR
dst[MAX:512] := 0
vloadunpackhq
__m512i _mm512_extloadunpackhi_epi64 (__m512i src, void const * mt, _MM_UPCONV_EPI64_ENUM conv, int hint)
Synopsis
__m512i _mm512_extloadunpackhi_epi64 (__m512i src, void const * mt, _MM_UPCONV_EPI64_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vloadunpackhq zmm {k}, m512
CPUID Flags: KNCNI
Description
Loads the high-64-byte-aligned portion of the quadword stream starting at element-aligned address mt-64, up-converted depending on the value of conv, and expanded into packed 64-bit integers in dst. The initial values of dst are copied from src. Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from src. hint indicates to the processor whether the loaded data is non-temporal.
Operation
UPCONVERT(address, offset, convertTo) {
CASE conv OF
_MM_UPCONV_EPI64_NONE: RETURN MEM[addr + 8*offset]
ESAC
}
UPCONVERTSIZE(convertTo) {
CASE conv OF
_MM_UPCONV_EPI64_NONE: RETURN 8
ESAC
}
dst[511:0] := src[511:0]
loadOffset := 0
foundNext64BytesBoundary := false
upSize := UPCONVERTSIZE(conv)
addr = mt-64
FOR j := 0 to 7
IF foundNext64BytesBoundary == false
IF (addr + (loadOffset + 1)*upSize) % 64 == 0
foundNext64BytesBoundary := true
FI
ELSE
i := j*64
dst[i+63:i] := UPCONVERT(addr, loadOffset, conv)
FI
loadOffset := loadOffset + 1
ENDFOR
dst[MAX:512] := 0
vloadunpackhq
__m512i _mm512_mask_extloadunpackhi_epi64 (__m512i src, __mmask8 k, void const * mt, _MM_UPCONV_EPI64_ENUM conv, int hint)
Synopsis
__m512i _mm512_mask_extloadunpackhi_epi64 (__m512i src, __mmask8 k, void const * mt, _MM_UPCONV_EPI64_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vloadunpackhq zmm {k}, m512
CPUID Flags: KNCNI
Description
Loads the high-64-byte-aligned portion of the quadword stream starting at element-aligned address mt-64, up-converted depending on the value of conv, and expanded into packed 64-bit integers in dst. The initial values of dst are copied from src. Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from src. hint indicates to the processor whether the loaded data is non-temporal. Elements are copied to dst according to element selector k (elements are skipped when the corresponding mask bit is not set).
Operation
UPCONVERT(address, offset, convertTo) {
CASE conv OF
_MM_UPCONV_EPI64_NONE: RETURN MEM[addr + 8*offset]
ESAC
}
UPCONVERTSIZE(convertTo) {
CASE conv OF
_MM_UPCONV_EPI64_NONE: RETURN 8
ESAC
}
dst[511:0] := src[511:0]
loadOffset := 0
foundNext64BytesBoundary := false
upSize := UPCONVERTSIZE(conv)
addr = mt-64
FOR j := 0 to 7
IF k[j]
IF foundNext64BytesBoundary == false
IF (addr + (loadOffset + 1)*upSize) % 64 == 0
foundNext64BytesBoundary := true
FI
ELSE
i := j*64
dst[i+63:i] := UPCONVERT(addr, loadOffset, conv)
FI
loadOffset := loadOffset + 1
FI
ENDFOR
dst[MAX:512] := 0
vloadunpackhpd
__m512d _mm512_extloadunpackhi_pd (__m512d src, void const * mt, _MM_UPCONV_PD_ENUM conv, int hint)
Synopsis
__m512d _mm512_extloadunpackhi_pd (__m512d src, void const * mt, _MM_UPCONV_PD_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vloadunpackhpd zmm {k}, m512
CPUID Flags: KNCNI
Description
Loads the high-64-byte-aligned portion of the quadword stream starting at element-aligned address mt-64, up-converted depending on the value of conv, and expanded into packed double-precision (64-bit) floating-point values in dst. The initial values of dst are copied from src. Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from src. hint indicates to the processor whether the loaded data is non-temporal.
Operation
UPCONVERT(address, offset, convertTo) {
CASE conv OF
_MM_UPCONV_PD_NONE: RETURN MEM[addr + 8*offset]
ESAC
}
UPCONVERTSIZE(convertTo) {
CASE conv OF
_MM_UPCONV_PD_NONE: RETURN 8
ESAC
}
dst[511:0] := src[511:0]
loadOffset := 0
foundNext64BytesBoundary := false
upSize := UPCONVERTSIZE(conv)
addr = mt-64
FOR j := 0 to 7
IF foundNext64BytesBoundary == false
IF (addr + (loadOffset + 1)*upSize) % 64 == 0
foundNext64BytesBoundary := true
FI
ELSE
i := j*64
dst[i+63:i] := UPCONVERT(addr, loadOffset, conv)
FI
loadOffset := loadOffset + 1
ENDFOR
dst[MAX:512] := 0
vloadunpackhpd
__m512d _mm512_mask_extloadunpackhi_pd (__m512d src, __mmask8 k, void const * mt, _MM_UPCONV_PD_ENUM conv, int hint)
Synopsis
__m512d _mm512_mask_extloadunpackhi_pd (__m512d src, __mmask8 k, void const * mt, _MM_UPCONV_PD_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vloadunpackhpd zmm {k}, m512
CPUID Flags: KNCNI
Description
Loads the high-64-byte-aligned portion of the quadword stream starting at element-aligned address mt-64, up-converted depending on the value of conv, and expanded into packed double-precision (64-bit) floating-point values in dst. The initial values of dst are copied from src. Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from src. hint indicates to the processor whether the loaded data is non-temporal. Elements are copied to dst according to element selector k (elements are skipped when the corresponding mask bit is not set).
Operation
UPCONVERT(address, offset, convertTo) {
CASE conv OF
_MM_UPCONV_PD_NONE: RETURN MEM[addr + 8*offset]
ESAC
}
UPCONVERTSIZE(convertTo) {
CASE conv OF
_MM_UPCONV_PD_NONE: RETURN 8
ESAC
}
dst[511:0] := src[511:0]
loadOffset := 0
foundNext64BytesBoundary := false
upSize := UPCONVERTSIZE(conv)
addr = mt-64
FOR j := 0 to 7
IF k[j]
IF foundNext64BytesBoundary == false
IF (addr + (loadOffset + 1)*upSize) % 64 == 0
foundNext64BytesBoundary := true
FI
ELSE
i := j*64
dst[i+63:i] := UPCONVERT(addr, loadOffset, conv)
FI
loadOffset := loadOffset + 1
FI
ENDFOR
dst[MAX:512] := 0
vloadunpackhps
__m512 _mm512_extloadunpackhi_ps (__m512 src, void const * mt, _MM_UPCONV_PS_ENUM conv, int hint)
Synopsis
__m512 _mm512_extloadunpackhi_ps (__m512 src, void const * mt, _MM_UPCONV_PS_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vloadunpackhps zmm {k}, m512
CPUID Flags: KNCNI
Description
Loads the high-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt-64, up-converted depending on the value of conv, and expanded into packed single-precision (32-bit) floating-point elements in dst. The initial values of dst are copied from src. Only those converted doublewords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those doublewords are taken from src. hint indicates to the processor whether the loaded data is non-temporal.
Operation
UPCONVERT(address, offset, convertTo) {
CASE conv OF
_MM_UPCONV_PS_NONE: RETURN MEM[addr + 4*offset]
_MM_UPCONV_PS_FLOAT16: RETURN Float16ToFloat32(MEM[addr + 2*offset])
_MM_UPCONV_PS_UINT8: RETURN UInt8ToFloat32(MEM[addr + offset])
_MM_UPCONV_PS_SINT8: RETURN SInt8ToFloat32(MEM[addr + offset])
_MM_UPCONV_PS_UINT16: RETURN UInt16ToFloat32(MEM[addr + 2*offset])
_MM_UPCONV_PS_SINT16: RETURN SInt16ToFloat32(MEM[addr + 2*offset])
ESAC
}
UPCONVERTSIZE(convertTo) {
CASE conv OF
_MM_UPCONV_PS_NONE: RETURN 4
_MM_UPCONV_PS_FLOAT16: RETURN 2
_MM_UPCONV_PS_UINT8: RETURN 1
_MM_UPCONV_PS_SINT8: RETURN 1
_MM_UPCONV_PS_UINT16: RETURN 2
_MM_UPCONV_PS_SINT16: RETURN 2
ESAC
}
dst[511:0] := src[511:0]
loadOffset := 0
foundNext64BytesBoundary := false
upSize := UPCONVERTSIZE(conv)
addr = mt-64
FOR j := 0 to 15
IF foundNext64BytesBoundary == false
IF (addr + (loadOffset + 1)*upSize) % 64 == 0
foundNext64BytesBoundary := true
FI
ELSE
i := j*32
dst[i+31:i] := UPCONVERT(addr, loadOffset, conv)
FI
loadOffset := loadOffset + 1
ENDFOR
dst[MAX:512] := 0
vloadunpackhps
__m512 _mm512_mask_extloadunpackhi_ps (__m512 src, __mmask16 k, void const * mt, _MM_UPCONV_PS_ENUM conv, int hint)
Synopsis
__m512 _mm512_mask_extloadunpackhi_ps (__m512 src, __mmask16 k, void const * mt, _MM_UPCONV_PS_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vloadunpackhps zmm {k}, m512
CPUID Flags: KNCNI
Description
Loads the high-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt-64, up-converted depending on the value of conv, and expanded into packed single-precision (32-bit) floating-point elements in dst. The initial values of dst are copied from src. Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from src. hint indicates to the processor whether the loaded data is non-temporal. Elements are copied to dst according to element selector k (elements are skipped when the corresponding mask bit is not set).
Operation
UPCONVERT(address, offset, convertTo) {
CASE conv OF
_MM_UPCONV_PS_NONE: RETURN MEM[addr + 4*offset]
_MM_UPCONV_PS_FLOAT16: RETURN Float16ToFloat32(MEM[addr + 2*offset])
_MM_UPCONV_PS_UINT8: RETURN UInt8ToFloat32(MEM[addr + offset])
_MM_UPCONV_PS_SINT8: RETURN SInt8ToFloat32(MEM[addr + offset])
_MM_UPCONV_PS_UINT16: RETURN UInt16ToFloat32(MEM[addr + 2*offset])
_MM_UPCONV_PS_SINT16: RETURN SInt16ToFloat32(MEM[addr + 2*offset])
ESAC
}
UPCONVERTSIZE(convertTo) {
CASE conv OF
_MM_UPCONV_PS_NONE: RETURN 4
_MM_UPCONV_PS_FLOAT16: RETURN 2
_MM_UPCONV_PS_UINT8: RETURN 1
_MM_UPCONV_PS_SINT8: RETURN 1
_MM_UPCONV_PS_UINT16: RETURN 2
_MM_UPCONV_PS_SINT16: RETURN 2
ESAC
}
dst[511:0] := src[511:0]
loadOffset := 0
foundNext64BytesBoundary := false
upSize := UPCONVERTSIZE(conv)
addr = mt-64
FOR j := 0 to 15
IF k[j]
IF foundNext64BytesBoundary == false
IF ((addr + (loadOffset + 1)*upSize) % 64) == 0
foundNext64BytesBoundary := true
FI
ELSE
i := j*32
dst[i+31:i] := UPCONVERT(addr, loadOffset, conv)
FI
loadOffset := loadOffset + 1
FI
ENDFOR
dst[MAX:512] := 0
vloadunpackld
__m512i _mm512_extloadunpacklo_epi32 (__m512i src, void const * mt, _MM_UPCONV_EPI32_ENUM conv, int hint)
Synopsis
__m512i _mm512_extloadunpacklo_epi32 (__m512i src, void const * mt, _MM_UPCONV_EPI32_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vloadunpackld zmm {k}, m512
CPUID Flags: KNCNI
Description
Loads the low-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt, up-converted depending on the value of conv, and expanded into packed 32-bit integers in dst. The initial values of dst are copied from src. Only those converted doublewords that occur before first 64-byte-aligned address following mt are loaded. Elements in the resulting vector that do not map to those doublewords are taken from src. hint indicates to the processor whether the loaded data is non-temporal.
Operation
UPCONVERT(address, offset, convertTo) {
CASE conv OF
_MM_UPCONV_EPI32_NONE: RETURN MEM[addr + 4*offset]
_MM_UPCONV_EPI32_UINT8: RETURN UInt8ToInt32(MEM[addr + offset])
_MM_UPCONV_EPI32_SINT8: RETURN SInt8ToInt32(MEM[addr + offset])
_MM_UPCONV_EPI32_UINT16: RETURN UInt16ToInt32(MEM[addr + 2*offset])
_MM_UPCONV_EPI32_SINT16: RETURN SInt16ToInt32(MEM[addr + 2*offset])
ESAC
}
UPCONVERTSIZE(convertTo) {
CASE conv OF
_MM_UPCONV_EPI32_NONE: RETURN 4
_MM_UPCONV_EPI32_UINT8: RETURN 1
_MM_UPCONV_EPI32_SINT8: RETURN 1
_MM_UPCONV_EPI32_UINT16: RETURN 2
_MM_UPCONV_EPI32_SINT16: RETURN 2
ESAC
}
dst[511:0] := src[511:0]
loadOffset := 0
upSize := UPCONVERTSIZE(conv)
addr = mt
FOR j := 0 to 15
i := j*32
dst[i+31:i] := UPCONVERT(addr, loadOffset, conv)
loadOffset := loadOffset + 1
IF (mt + loadOffset * upSize) % 64 == 0
break
FI
ENDFOR
dst[MAX:512] := 0
vloadunpackld
__m512i _mm512_mask_extloadunpacklo_epi32 (__m512i src, __mmask16 k, void const * mt, _MM_UPCONV_EPI32_ENUM conv, int hint)
Synopsis
__m512i _mm512_mask_extloadunpacklo_epi32 (__m512i src, __mmask16 k, void const * mt, _MM_UPCONV_EPI32_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vloadunpackld zmm {k}, m512
CPUID Flags: KNCNI
Description
Loads the low-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt, up-converted depending on the value of conv, and expanded into packed 32-bit integers in dst. The initial values of dst are copied from src. Only those converted doublewords that occur before first 64-byte-aligned address following mt are loaded. Elements in the resulting vector that do not map to those doublewords are taken from src. hint indicates to the processor whether the loaded data is non-temporal. Elements are copied to dst according to element selector k (elements are skipped when the corresponding mask bit is not set).
Operation
UPCONVERT(address, offset, convertTo) {
CASE conv OF
_MM_UPCONV_EPI32_NONE: RETURN MEM[addr + 4*offset]
_MM_UPCONV_EPI32_UINT8: RETURN UInt8ToInt32(MEM[addr + offset])
_MM_UPCONV_EPI32_SINT8: RETURN SInt8ToInt32(MEM[addr + offset])
_MM_UPCONV_EPI32_UINT16: RETURN UInt16ToInt32(MEM[addr + 2*offset])
_MM_UPCONV_EPI32_SINT16: RETURN SInt16ToInt32(MEM[addr + 2*offset])
ESAC
}
UPCONVERTSIZE(convertTo) {
CASE conv OF
_MM_UPCONV_EPI32_NONE: RETURN 4
_MM_UPCONV_EPI32_UINT8: RETURN 1
_MM_UPCONV_EPI32_SINT8: RETURN 1
_MM_UPCONV_EPI32_UINT16: RETURN 2
_MM_UPCONV_EPI32_SINT16: RETURN 2
ESAC
}
dst[511:0] := src[511:0]
loadOffset := 0
upSize := UPCONVERTSIZE(conv)
addr = mt
FOR j := 0 to 15
IF k[j]
i := j*32
dst[i+31:i] := UPCONVERT(addr, loadOffset, conv)
loadOffset := loadOffset + 1
IF (mt + loadOffset * upSize) % 64 == 0
break
FI
FI
ENDFOR
dst[MAX:512] := 0
vloadunpacklq
__m512i _mm512_extloadunpacklo_epi64 (__m512i src, void const * mt, _MM_UPCONV_EPI64_ENUM conv, int hint)
Synopsis
__m512i _mm512_extloadunpacklo_epi64 (__m512i src, void const * mt, _MM_UPCONV_EPI64_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vloadunpacklq zmm {k}, m512
CPUID Flags: KNCNI
Description
Loads the low-64-byte-aligned portion of the quadword stream starting at element-aligned address mt, up-converted depending on the value of conv, and expanded into packed 64-bit integers in dst. The initial values of dst are copied from src. Only those converted quadwords that occur before first 64-byte-aligned address following mt are loaded. Elements in the resulting vector that do not map to those quadwords are taken from src. hint indicates to the processor whether the loaded data is non-temporal.
Operation
UPCONVERT(address, offset, convertTo) {
CASE conv OF
_MM_UPCONV_EPI64_NONE: RETURN MEM[addr + 8*offset]
ESAC
}
UPCONVERTSIZE(convertTo) {
CASE conv OF
_MM_UPCONV_EPI64_NONE: RETURN 8
ESAC
}
dst[511:0] := src[511:0]
loadOffset := 0
upSize := UPCONVERTSIZE(conv)
addr = mt
FOR j := 0 to 7
i := j*64
dst[i+63:i] := UPCONVERT(addr, loadOffset, conv)
loadOffset := loadOffset + 1
IF ((addr + loadOffset*upSize) % 64) == 0
BREAK
FI
ENDFOR
dst[MAX:512] := 0
vloadunpacklq
__m512i _mm512_mask_extloadunpacklo_epi64 (__m512i src, __mmask8 k, void const * mt, _MM_UPCONV_EPI64_ENUM conv, int hint)
Synopsis
__m512i _mm512_mask_extloadunpacklo_epi64 (__m512i src, __mmask8 k, void const * mt, _MM_UPCONV_EPI64_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vloadunpacklq zmm {k}, m512
CPUID Flags: KNCNI
Description
Loads the low-64-byte-aligned portion of the quadword stream starting at element-aligned address mt, up-converted depending on the value of conv, and expanded into packed 64-bit integers in dst. The initial values of dst are copied from src. Only those converted quadwords that occur before first 64-byte-aligned address following mt are loaded. Elements in the resulting vector that do not map to those quadwords are taken from src. hint indicates to the processor whether the loaded data is non-temporal. Elements are copied to dst according to element selector k (elements are skipped when the corresponding mask bit is not set).
Operation
UPCONVERT(address, offset, convertTo) {
CASE conv OF
_MM_UPCONV_EPI64_NONE: RETURN MEM[addr + 8*offset]
ESAC
}
UPCONVERTSIZE(convertTo) {
CASE conv OF
_MM_UPCONV_EPI64_NONE: RETURN 8
ESAC
}
dst[511:0] := src[511:0]
loadOffset := 0
upSize := UPCONVERTSIZE(conv)
addr = mt
FOR j := 0 to 7
IF k[j]
i := j*64
dst[i+63:i] := UPCONVERT(addr, loadOffset, conv)
loadOffset := loadOffset + 1
IF ((addr + loadOffset*upSize) % 64) == 0
BREAK
FI
FI
ENDFOR
dst[MAX:512] := 0
vloadunpacklpd
__m512d _mm512_extloadunpacklo_pd (__m512d src, void const * mt, _MM_UPCONV_PD_ENUM conv, int hint)
Synopsis
__m512d _mm512_extloadunpacklo_pd (__m512d src, void const * mt, _MM_UPCONV_PD_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vloadunpacklpd zmm {k}, m512
CPUID Flags: KNCNI
Description
Loads the low-64-byte-aligned portion of the quadword stream starting at element-aligned address mt, up-converted depending on the value of conv, and expanded into packed double-precision (64-bit) floating-point elements in dst. The initial values of dst are copied from src. Only those converted quadwords that occur before first 64-byte-aligned address following mt are loaded. Elements in the resulting vector that do not map to those quadwords are taken from src. hint indicates to the processor whether the loaded data is non-temporal.
Operation
UPCONVERT(address, offset, convertTo) {
CASE conv OF
_MM_UPCONV_PD_NONE: RETURN MEM[addr + 8*offset]
ESAC
}
UPCONVERTSIZE(convertTo) {
CASE conv OF
_MM_UPCONV_PD_NONE: RETURN 8
ESAC
}
dst[511:0] := src[511:0]
loadOffset := 0
upSize := UPCONVERTSIZE(conv)
addr = mt
FOR j := 0 to 7
i := j*64
dst[i+63:i] := UPCONVERT(addr, loadOffset, conv)
loadOffset := loadOffset + 1
IF (mt + loadOffset * upSize) % 64 == 0
break
FI
ENDFOR
dst[MAX:512] := 0
vloadunpacklpd
__m512d _mm512_mask_extloadunpacklo_pd (__m512d src, __mmask8 k, void const * mt, _MM_UPCONV_PD_ENUM conv, int hint)
Synopsis
__m512d _mm512_mask_extloadunpacklo_pd (__m512d src, __mmask8 k, void const * mt, _MM_UPCONV_PD_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vloadunpacklpd zmm {k}, m512
CPUID Flags: KNCNI
Description
Loads the low-64-byte-aligned portion of the quadword stream starting at element-aligned address mt, up-converted depending on the value of conv, and expanded into packed double-precision (64-bit) floating-point elements in dst. The initial values of dst are copied from src. Only those converted quadwords that occur before first 64-byte-aligned address following mt are loaded. Elements in the resulting vector that do not map to those quadwords are taken from src. hint indicates to the processor whether the loaded data is non-temporal. Elements are copied to dst according to element selector k (elements are skipped when the corresponding mask bit is not set).
Operation
UPCONVERT(address, offset, convertTo) {
CASE conv OF
_MM_UPCONV_PD_NONE: RETURN MEM[addr + 8*offset]
ESAC
}
UPCONVERTSIZE(convertTo) {
CASE conv OF
_MM_UPCONV_PD_NONE: RETURN 8
ESAC
}
dst[511:0] := src[511:0]
loadOffset := 0
upSize := UPCONVERTSIZE(conv)
addr = mt
FOR j := 0 to 7
IF k[j]
i := j*64
dst[i+63:i] := UPCONVERT(addr, loadOffset, conv)
loadOffset := loadOffset + 1
IF (mt + loadOffset * upSize) % 64 == 0
break
FI
FI
ENDFOR
dst[MAX:512] := 0
vloadunpacklps
__m512 _mm512_extloadunpacklo_ps (__m512 src, void const * mt, _MM_UPCONV_PS_ENUM conv, int hint)
Synopsis
__m512 _mm512_extloadunpacklo_ps (__m512 src, void const * mt, _MM_UPCONV_PS_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vloadunpacklps zmm {k}, m512
CPUID Flags: KNCNI
Description
Loads the low-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt, up-converted depending on the value of conv, and expanded into packed single-precision (32-bit) floating-point elements in dst. The initial values of dst are copied from src. Only those converted doublewords that occur before first 64-byte-aligned address following mt are loaded. Elements in the resulting vector that do not map to those doublewords are taken from src. hint indicates to the processor whether the loaded data is non-temporal.
Operation
UPCONVERT(address, offset, convertTo) {
CASE conv OF
_MM_UPCONV_PS_NONE: RETURN MEM[addr + 4*offset]
_MM_UPCONV_PS_FLOAT16: RETURN Float16ToFloat32(MEM[addr + 2*offset])
_MM_UPCONV_PS_UINT8: RETURN UInt8ToFloat32(MEM[addr + offset])
_MM_UPCONV_PS_SINT8: RETURN SInt8ToFloat32(MEM[addr + offset])
_MM_UPCONV_PS_UINT16: RETURN UInt16ToFloat32(MEM[addr + 2*offset])
_MM_UPCONV_PS_SINT16: RETURN SInt16ToFloat32(MEM[addr + 2*offset])
ESAC
}
UPCONVERTSIZE(convertTo) {
CASE conv OF
_MM_UPCONV_PS_NONE: RETURN 4
_MM_UPCONV_PS_FLOAT16: RETURN 2
_MM_UPCONV_PS_UINT8: RETURN 1
_MM_UPCONV_PS_SINT8: RETURN 1
_MM_UPCONV_PS_UINT16: RETURN 2
_MM_UPCONV_PS_SINT16: RETURN 2
ESAC
}
dst[511:0] := src[511:0]
loadOffset := 0
upSize := UPCONVERTSIZE(conv)
addr = mt
FOR j := 0 to 15
i := j*32
dst[i+31:i] := UPCONVERT(addr, loadOffset, conv)
loadOffset := loadOffset + 1
IF (mt + loadOffset * upSize) % 64 == 0
break
FI
ENDFOR
dst[MAX:512] := 0
vloadunpacklps
__m512 _mm512_mask_extloadunpacklo_ps (__m512 src, __mmask16 k, void const * mt, _MM_UPCONV_PS_ENUM conv, int hint)
Synopsis
__m512 _mm512_mask_extloadunpacklo_ps (__m512 src, __mmask16 k, void const * mt, _MM_UPCONV_PS_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vloadunpacklps zmm {k}, m512
CPUID Flags: KNCNI
Description
Loads the low-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt, up-converted depending on the value of conv, and expanded into packed single-precision (32-bit) floating-point elements in dst. The initial values of dst are copied from src. Only those converted doublewords that occur before first 64-byte-aligned address following mt are loaded. Elements in the resulting vector that do not map to those doublewords are taken from src. hint indicates to the processor whether the loaded data is non-temporal. Elements are copied to dst according to element selector k (elements are skipped when the corresponding mask bit is not set).
Operation
UPCONVERT(address, offset, convertTo) {
CASE conv OF
_MM_UPCONV_PS_NONE: RETURN MEM[addr + 4*offset]
_MM_UPCONV_PS_FLOAT16: RETURN Float16ToFloat32(MEM[addr + 2*offset])
_MM_UPCONV_PS_UINT8: RETURN UInt8ToFloat32(MEM[addr + offset])
_MM_UPCONV_PS_SINT8: RETURN SInt8ToFloat32(MEM[addr + offset])
_MM_UPCONV_PS_UINT16: RETURN UInt16ToFloat32(MEM[addr + 2*offset])
_MM_UPCONV_PS_SINT16: RETURN SInt16ToFloat32(MEM[addr + 2*offset])
ESAC
}
UPCONVERTSIZE(convertTo) {
CASE conv OF
_MM_UPCONV_PS_NONE: RETURN 4
_MM_UPCONV_PS_FLOAT16: RETURN 2
_MM_UPCONV_PS_UINT8: RETURN 1
_MM_UPCONV_PS_SINT8: RETURN 1
_MM_UPCONV_PS_UINT16: RETURN 2
_MM_UPCONV_PS_SINT16: RETURN 2
ESAC
}
dst[511:0] := src[511:0]
loadOffset := 0
upSize := UPCONVERTSIZE(conv)
addr = mt
FOR j := 0 to 15
IF k[j]
i := j*32
dst[i+31:i] := UPCONVERT(addr, loadOffset, conv)
loadOffset := loadOffset + 1
IF (mt + loadOffset * upSize) % 64 == 0
break
FI
FI
ENDFOR
dst[MAX:512] := 0
vpackstorehd
void _mm512_extpackstorehi_epi32 (void * mt, __m512i v1, _MM_DOWNCONV_EPI32_ENUM conv, int hint)
Synopsis
void _mm512_extpackstorehi_epi32 (void * mt, __m512i v1, _MM_DOWNCONV_EPI32_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vpackstorehd m512 {k}, zmm
CPUID Flags: KNCNI
Description
Down-converts and stores packed 32-bit integer elements of v1 into a byte/word/doubleword stream according to conv at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). hint indicates to the processor whether the data is non-temporal.
Operation
DOWNCONVERT(element, convertTo) {
CASE convertTo OF
_MM_UPCONV_EPI32_NONE: RETURN element[i+31:i]
_MM_UPCONV_EPI32_UINT8: RETURN UInt32ToUInt8(element[i+31:i])
_MM_UPCONV_EPI32_SINT8: RETURN SInt32ToSInt8(element[i+31:i])
_MM_UPCONV_EPI32_UINT16: RETURN UInt32ToUInt16(element[i+31:i])
_MM_UPCONV_EPI32_SINT16: RETURN SInt32ToSInt16(element[i+31:i])
ESAC
}
DOWNCONVERTSIZE(convertTo) {
CASE convertTo OF
_MM_UPCONV_EPI32_NONE: RETURN 4
_MM_UPCONV_EPI32_UINT8: RETURN 1
_MM_UPCONV_EPI32_SINT8: RETURN 1
_MM_UPCONV_EPI32_UINT16: RETURN 2
_MM_UPCONV_EPI32_SINT16: RETURN 2
ESAC
}
storeOffset := 0
foundNext64BytesBoundary := false
downSize := DOWNCONVERTSIZE(conv)
addr = mt-64
FOR j := 0 to 15
IF foundNext64BytesBoundary == false
IF ((addr + (storeOffset + 1)*downSize) % 64) == 0
foundNext64BytesBoundary = true
FI
ELSE
i := j*32
tmp := DOWNCONVERT(v1[i+31:i], conv)
storeAddr := addr + storeOffset * downSize
CASE downSize OF
4: MEM[storeAddr] := tmp[31:0]
2: MEM[storeAddr] := tmp[15:0]
1: MEM[storeAddr] := tmp[7:0]
ESAC
FI
storeOffset := storeOffset + 1
ENDFOR
dst[MAX:512] := 0
vpackstorehd
void _mm512_mask_extpackstorehi_epi32 (void * mt, __mmask16 k, __m512i v1, _MM_DOWNCONV_EPI32_ENUM conv, int hint)
Synopsis
void _mm512_mask_extpackstorehi_epi32 (void * mt, __mmask16 k, __m512i v1, _MM_DOWNCONV_EPI32_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vpackstorehd m512 {k}, zmm
CPUID Flags: KNCNI
Description
Down-converts and stores packed 32-bit integer elements of v1 into a byte/word/doubleword stream according to conv at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). hint indicates to the processor whether the data is non-temporal. Elements are stored to memory according to element selector k (elements are skipped when the corresponding mask bit is not set).
Operation
DOWNCONVERT(element, convertTo) {
CASE convertTo OF
_MM_UPCONV_EPI32_NONE: RETURN element[i+31:i]
_MM_UPCONV_EPI32_UINT8: RETURN UInt32ToUInt8(element[i+31:i])
_MM_UPCONV_EPI32_SINT8: RETURN SInt32ToSInt8(element[i+31:i])
_MM_UPCONV_EPI32_UINT16: RETURN UInt32ToUInt16(element[i+31:i])
_MM_UPCONV_EPI32_SINT16: RETURN SInt32ToSInt16(element[i+31:i])
ESAC
}
DOWNCONVERTSIZE(convertTo) {
CASE convertTo OF
_MM_UPCONV_EPI32_NONE: RETURN 4
_MM_UPCONV_EPI32_UINT8: RETURN 1
_MM_UPCONV_EPI32_SINT8: RETURN 1
_MM_UPCONV_EPI32_UINT16: RETURN 2
_MM_UPCONV_EPI32_SINT16: RETURN 2
ESAC
}
storeOffset := 0
foundNext64BytesBoundary := false
downSize := DOWNCONVERTSIZE(conv)
addr = mt-64
FOR j := 0 to 15
IF k[j]
IF foundNext64BytesBoundary == false
IF ((addr + (storeOffset + 1)*downSize) % 64) == 0
foundNext64BytesBoundary = true
FI
ELSE
i := j*32
tmp := DOWNCONVERT(v1[i+31:i], conv)
storeAddr := addr + storeOffset * downSize
CASE downSize OF
4: MEM[storeAddr] := tmp[31:0]
2: MEM[storeAddr] := tmp[15:0]
1: MEM[storeAddr] := tmp[7:0]
ESAC
FI
storeOffset := storeOffset + 1
FI
ENDFOR
dst[MAX:512] := 0
vpackstorehq
void _mm512_extpackstorehi_epi64 (void * mt, __m512i v1, _MM_DOWNCONV_EPI64_ENUM conv, int hint)
Synopsis
void _mm512_extpackstorehi_epi64 (void * mt, __m512i v1, _MM_DOWNCONV_EPI64_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vpackstorehq m512 {k}, zmm
CPUID Flags: KNCNI
Description
Down-converts and stores packed 64-bit integer elements of v1 into a quadword stream according to conv at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). hint indicates to the processor whether the data is non-temporal.
Operation
DOWNCONVERT(element, convertTo) {
CASE convertTo OF
_MM_UPCONV_EPI64_NONE: RETURN element[i+63:i]
ESAC
}
DOWNCONVERTSIZE(convertTo) {
CASE convertTo OF
_MM_UPCONV_EPI64_NONE: RETURN 8
ESAC
}
storeOffset := 0
foundNext64BytesBoundary := false
downSize := DOWNCONVERTSIZE(conv)
addr = mt-64
FOR j := 0 to 7
IF foundNext64BytesBoundary == false
IF ((addr + (storeOffset + 1)*downSize) % 64) == 0
foundNext64BytesBoundary = true
FI
ELSE
i := j*64
tmp := DOWNCONVERT(v1[i+63:i], conv)
storeAddr := addr + storeOffset * downSize
CASE downSize OF
8: MEM[storeAddr] := tmp[63:0]
ESAC
FI
storeOffset := storeOffset + 1
ENDFOR
dst[MAX:512] := 0
vpackstorehq
void _mm512_mask_extpackstorehi_epi64 (void * mt, __mmask8 k, __m512i v1, _MM_DOWNCONV_EPI64_ENUM conv, int hint)
Synopsis
void _mm512_mask_extpackstorehi_epi64 (void * mt, __mmask8 k, __m512i v1, _MM_DOWNCONV_EPI64_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vpackstorehq m512 {k}, zmm
CPUID Flags: KNCNI
Description
Down-converts and stores packed 64-bit integer elements of v1 into a quadword stream according to conv at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). hint indicates to the processor whether the data is non-temporal. Elements are stored to memory according to element selector k (elements are skipped when the corresponding mask bit is not set).
Operation
DOWNCONVERT(element, convertTo) {
CASE convertTo OF
_MM_UPCONV_EPI64_NONE: RETURN element[i+63:i]
ESAC
}
DOWNCONVERTSIZE(convertTo) {
CASE convertTo OF
_MM_UPCONV_EPI64_NONE: RETURN 8
ESAC
}
storeOffset := 0
foundNext64BytesBoundary := false
downSize := DOWNCONVERTSIZE(conv)
addr = mt-64
FOR j := 0 to 7
IF k[j]
IF foundNext64BytesBoundary == false
IF ((addr + (storeOffset + 1)*downSize) % 64) == 0
foundNext64BytesBoundary = true
FI
ELSE
i := j*64
tmp := DOWNCONVERT(v1[i+63:i], conv)
storeAddr := addr + storeOffset * downSize
CASE downSize OF
8: MEM[storeAddr] := tmp[63:0]
ESAC
FI
storeOffset := storeOffset + 1
FI
ENDFOR
dst[MAX:512] := 0
vpackstorehpd
void _mm512_extpackstorehi_pd (void * mt, __m512d v1, _MM_DOWNCONV_PD_ENUM conv, int hint)
Synopsis
void _mm512_extpackstorehi_pd (void * mt, __m512d v1, _MM_DOWNCONV_PD_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vpackstorehpd m512 {k}, zmm
CPUID Flags: KNCNI
Description
Down-converts and stores packed double-precision (64-bit) floating-point elements of v1 into a quadword stream according to conv at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). hint indicates to the processor whether the data is non-temporal.
Operation
DOWNCONVERT(element, convertTo) {
CASE convertTo OF
_MM_UPCONV_PD_NONE: RETURN element[i+63:i]
ESAC
}
DOWNCONVERTSIZE(convertTo) {
CASE convertTo OF
_MM_UPCONV_PD_NONE: RETURN 8
ESAC
}
storeOffset := 0
foundNext64BytesBoundary := false
downSize := DOWNCONVERTSIZE(conv)
addr = mt-64
FOR j := 0 to 7
IF foundNext64BytesBoundary == false
IF ((addr + (storeOffset + 1)*downSize) % 64) == 0
foundNext64BytesBoundary = true
FI
ELSE
i := j*64
tmp := DOWNCONVERT(v1[i+63:i], conv)
storeAddr := addr + storeOffset * downSize
CASE downSize OF
8: MEM[storeAddr] := tmp[63:0]
ESAC
FI
storeOffset := storeOffset + 1
ENDFOR
dst[MAX:512] := 0
vpackstorehpd
void _mm512_mask_extpackstorehi_pd (void * mt, __mmask8 k, __m512d v1, _MM_DOWNCONV_PD_ENUM conv, int hint)
Synopsis
void _mm512_mask_extpackstorehi_pd (void * mt, __mmask8 k, __m512d v1, _MM_DOWNCONV_PD_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vpackstorehpd m512 {k}, zmm
CPUID Flags: KNCNI
Description
Down-converts and stores packed double-precision (64-bit) floating-point elements of v1 into a quadword stream according to conv at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). hint indicates to the processor whether the data is non-temporal. Elements are stored to memory according to element selector k (elements are skipped when the corresponding mask bit is not set).
Operation
DOWNCONVERT(element, convertTo) {
CASE convertTo OF
_MM_UPCONV_PD_NONE: RETURN element[i+63:i]
ESAC
}
DOWNCONVERTSIZE(convertTo) {
CASE convertTo OF
_MM_UPCONV_PD_NONE: RETURN 8
ESAC
}
storeOffset := 0
foundNext64BytesBoundary := false
downSize := DOWNCONVERTSIZE(conv)
addr = mt-64
FOR j := 0 to 7
IF k[j]
IF foundNext64BytesBoundary == false
IF ((addr + (storeOffset + 1)*downSize) % 64) == 0
foundNext64BytesBoundary = true
FI
ELSE
i := j*64
tmp := DOWNCONVERT(v1[i+63:i], conv)
storeAddr := addr + storeOffset * downSize
CASE downSize OF
8: MEM[storeAddr] := tmp[63:0]
ESAC
FI
storeOffset := storeOffset + 1
FI
ENDFOR
dst[MAX:512] := 0
vpackstorehps
void _mm512_extpackstorehi_ps (void * mt, __m512 v1, _MM_DOWNCONV_PS_ENUM conv, int hint)
Synopsis
void _mm512_extpackstorehi_ps (void * mt, __m512 v1, _MM_DOWNCONV_PS_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vpackstorehps m512 {k}, zmm
CPUID Flags: KNCNI
Description
Down-converts and stores packed single-precision (32-bit) floating-point elements of v1 into a byte/word/doubleword stream according to conv at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). hint indicates to the processor whether the data is non-temporal.
Operation
DOWNCONVERT(element, convertTo) {
CASE convertTo OF
_MM_UPCONV_PS_NONE: RETURN element[i+31:i]
_MM_UPCONV_PS_FLOAT16: RETURN Float32ToFloat16(element[i+31:i])
_MM_UPCONV_PS_UINT8: RETURN UInt32ToUInt8(element[i+31:i])
_MM_UPCONV_PS_SINT8: RETURN SInt32ToSInt8(element[i+31:i])
_MM_UPCONV_PS_UINT16: RETURN UInt32ToUInt16(element[i+31:i])
_MM_UPCONV_PS_SINT16: RETURN SInt32ToSInt16(element[i+31:i])
ESAC
}
DOWNCONVERTSIZE(convertTo) {
CASE convertTo OF
_MM_UPCONV_PS_NONE: RETURN 4
_MM_UPCONV_PS_FLOAT16: RETURN 2
_MM_UPCONV_PS_UINT8: RETURN 1
_MM_UPCONV_PS_SINT8: RETURN 1
_MM_UPCONV_PS_UINT16: RETURN 2
_MM_UPCONV_PS_SINT16: RETURN 2
ESAC
}
storeOffset := 0
foundNext64BytesBoundary := false
downSize := DOWNCONVERTSIZE(conv)
addr = mt-64
FOR j := 0 to 15
IF foundNext64BytesBoundary == false
IF ((addr + (storeOffset + 1)*downSize) % 64) == 0
foundNext64BytesBoundary = true
FI
ELSE
i := j*32
tmp := DOWNCONVERT(v1[i+31:i], conv)
storeAddr := addr + storeOffset * downSize
CASE downSize OF
4: MEM[storeAddr] := tmp[31:0]
2: MEM[storeAddr] := tmp[15:0]
1: MEM[storeAddr] := tmp[7:0]
ESAC
FI
storeOffset := storeOffset + 1
ENDFOR
dst[MAX:512] := 0
vpackstorehps
void _mm512_mask_extpackstorehi_ps (void * mt, __mmask16 k, __m512 v1, _MM_DOWNCONV_PS_ENUM conv, int hint)
Synopsis
void _mm512_mask_extpackstorehi_ps (void * mt, __mmask16 k, __m512 v1, _MM_DOWNCONV_PS_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vpackstorehps m512 {k}, zmm
CPUID Flags: KNCNI
Description
Down-converts and stores packed single-precision (32-bit) floating-point elements of v1 into a byte/word/doubleword stream according to conv at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). hint indicates to the processor whether the data is non-temporal. Elements are stored to memory according to element selector k (elements are skipped when the corresponding mask bit is not set).
Operation
DOWNCONVERT(element, convertTo) {
CASE convertTo OF
_MM_UPCONV_PS_NONE: RETURN element[i+31:i]
_MM_UPCONV_PS_FLOAT16: RETURN Float32ToFloat16(element[i+31:i])
_MM_UPCONV_PS_UINT8: RETURN UInt32ToUInt8(element[i+31:i])
_MM_UPCONV_PS_SINT8: RETURN SInt32ToSInt8(element[i+31:i])
_MM_UPCONV_PS_UINT16: RETURN UInt32ToUInt16(element[i+31:i])
_MM_UPCONV_PS_SINT16: RETURN SInt32ToSInt16(element[i+31:i])
ESAC
}
DOWNCONVERTSIZE(convertTo) {
CASE convertTo OF
_MM_UPCONV_PS_NONE: RETURN 4
_MM_UPCONV_PS_FLOAT16: RETURN 2
_MM_UPCONV_PS_UINT8: RETURN 1
_MM_UPCONV_PS_SINT8: RETURN 1
_MM_UPCONV_PS_UINT16: RETURN 2
_MM_UPCONV_PS_SINT16: RETURN 2
ESAC
}
storeOffset := 0
foundNext64BytesBoundary := false
downSize := DOWNCONVERTSIZE(conv)
addr = mt-64
FOR j := 0 to 15
IF k[j]
IF foundNext64BytesBoundary == false
IF ((addr + (storeOffset + 1)*downSize) % 64) == 0
foundNext64BytesBoundary = true
FI
ELSE
i := j*32
tmp := DOWNCONVERT(v1[i+31:i], conv)
storeAddr := addr + storeOffset * downSize
CASE downSize OF
4: MEM[storeAddr] := tmp[31:0]
2: MEM[storeAddr] := tmp[15:0]
1: MEM[storeAddr] := tmp[7:0]
ESAC
FI
storeOffset := storeOffset + 1
FI
ENDFOR
dst[MAX:512] := 0
vpackstoreld
void _mm512_extpackstorelo_epi32 (void * mt, __m512i v1, _MM_DOWNCONV_EPI32_ENUM conv, int hint)
Synopsis
void _mm512_extpackstorelo_epi32 (void * mt, __m512i v1, _MM_DOWNCONV_EPI32_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vpackstoreld m512 {k}, zmm
CPUID Flags: KNCNI
Description
Down-converts and stores packed 32-bit integer elements of v1 into a byte/word/doubleword stream according to conv at a logically mapped starting address mt, storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following mt). hint indicates to the processor whether the data is non-temporal.
Operation
DOWNCONVERT(element, convertTo) {
CASE convertTo OF
_MM_UPCONV_EPI32_NONE: RETURN element[i+31:i]
_MM_UPCONV_EPI32_UINT8: RETURN UInt32ToUInt8(element[i+31:i])
_MM_UPCONV_EPI32_SINT8: RETURN SInt32ToSInt8(element[i+31:i])
_MM_UPCONV_EPI32_UINT16: RETURN UInt32ToUInt16(element[i+31:i])
_MM_UPCONV_EPI32_SINT16: RETURN SInt32ToSInt16(element[i+31:i])
ESAC
}
DOWNCONVERTSIZE(convertTo) {
CASE convertTo OF
_MM_UPCONV_EPI32_NONE: RETURN 4
_MM_UPCONV_EPI32_UINT8: RETURN 1
_MM_UPCONV_EPI32_SINT8: RETURN 1
_MM_UPCONV_EPI32_UINT16: RETURN 2
_MM_UPCONV_EPI32_SINT16: RETURN 2
ESAC
}
storeOffset := 0
downSize := DOWNCONVERTSIZE(conv)
addr = mt
FOR j := 0 to 15
i := j*32
tmp := DOWNCONVERT(v1[i+31:i], conv)
storeAddr := addr + storeOffset * downSize
CASE downSize OF
4: MEM[storeAddr] := tmp[31:0]
2: MEM[storeAddr] := tmp[15:0]
1: MEM[storeAddr] := tmp[7:0]
ESAC
storeOffset := storeOffset + 1
IF ((addr + storeOffset * downSize) % 64) == 0
BREAK
FI
ENDFOR
dst[MAX:512] := 0
vpackstoreld
void _mm512_mask_extpackstorelo_epi32 (void * mt, __mmask16 k, __m512i v1, _MM_DOWNCONV_EPI32_ENUM conv, int hint)
Synopsis
void _mm512_mask_extpackstorelo_epi32 (void * mt, __mmask16 k, __m512i v1, _MM_DOWNCONV_EPI32_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vpackstoreld m512 {k}, zmm
CPUID Flags: KNCNI
Description
Down-converts and stores packed 32-bit integer elements of v1 into a byte/word/doubleword stream according to conv at a logically mapped starting address mt, storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following mt). hint indicates to the processor whether the data is non-temporal. Elements are written to memory according to element selector k (elements are skipped when the corresponding mask bit is not set).
Operation
DOWNCONVERT(element, convertTo) {
CASE convertTo OF
_MM_UPCONV_EPI32_NONE: RETURN element[i+31:i]
_MM_UPCONV_EPI32_UINT8: RETURN UInt32ToUInt8(element[i+31:i])
_MM_UPCONV_EPI32_SINT8: RETURN SInt32ToSInt8(element[i+31:i])
_MM_UPCONV_EPI32_UINT16: RETURN UInt32ToUInt16(element[i+31:i])
_MM_UPCONV_EPI32_SINT16: RETURN SInt32ToSInt16(element[i+31:i])
ESAC
}
DOWNCONVERTSIZE(convertTo) {
CASE convertTo OF
_MM_UPCONV_EPI32_NONE: RETURN 4
_MM_UPCONV_EPI32_UINT8: RETURN 1
_MM_UPCONV_EPI32_SINT8: RETURN 1
_MM_UPCONV_EPI32_UINT16: RETURN 2
_MM_UPCONV_EPI32_SINT16: RETURN 2
ESAC
}
storeOffset := 0
downSize := DOWNCONVERTSIZE(conv)
addr = mt
FOR j := 0 to 15
IF k[j]
i := j*32
tmp := DOWNCONVERT(v1[i+31:i], conv)
storeAddr := addr + storeOffset * downSize
CASE downSize OF
4: MEM[storeAddr] := tmp[31:0]
2: MEM[storeAddr] := tmp[15:0]
1: MEM[storeAddr] := tmp[7:0]
ESAC
storeOffset := storeOffset + 1
IF ((addr + storeOffset * downSize) % 64) == 0
BREAK
FI
FI
ENDFOR
dst[MAX:512] := 0
vpackstorelq
void _mm512_extpackstorelo_epi64 (void * mt, __m512i v1, _MM_DOWNCONV_EPI64_ENUM conv, int hint)
Synopsis
void _mm512_extpackstorelo_epi64 (void * mt, __m512i v1, _MM_DOWNCONV_EPI64_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vpackstorelq m512 {k}, zmm
CPUID Flags: KNCNI
Description
Down-converts and stores packed 64-bit integer elements of v1 into a quadword stream according to conv at a logically mapped starting address mt, storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following mt). hint indicates to the processor whether the data is non-temporal.
Operation
DOWNCONVERT(element, convertTo) {
CASE convertTo OF
_MM_UPCONV_EPI64_NONE: RETURN element[i+63:i]
ESAC
}
DOWNCONVERTSIZE(convertTo) {
CASE convertTo OF
_MM_UPCONV_EPI64_NONE: RETURN 8
ESAC
}
storeOffset := 0
downSize := DOWNCONVERTSIZE(conv)
addr = mt
FOR j := 0 to 7
i := j*64
tmp := DOWNCONVERT(v1[i+63:i], conv)
storeAddr := addr + storeOffset * downSize
CASE downSize OF
8: MEM[storeAddr] := tmp[63:0]
ESAC
storeOffset := storeOffset + 1
IF ((addr + storeOffset * downSize) % 64) == 0
BREAK
FI
ENDFOR
dst[MAX:512] := 0
vpackstorelq
void _mm512_mask_extpackstorelo_epi64 (void * mt, __mmask8 k, __m512i v1, _MM_DOWNCONV_EPI64_ENUM conv, int hint)
Synopsis
void _mm512_mask_extpackstorelo_epi64 (void * mt, __mmask8 k, __m512i v1, _MM_DOWNCONV_EPI64_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vpackstorelq m512 {k}, zmm
CPUID Flags: KNCNI
Description
Down-converts and stores packed 64-bit integer elements of v1 into a quadword stream according to conv at a logically mapped starting address mt, storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following mt). hint indicates to the processor whether the data is non-temporal. Elements are stored to memory according to element selector k (elements are skipped when the corresponding mask bit is not set).
Operation
DOWNCONVERT(element, convertTo) {
CASE convertTo OF
_MM_UPCONV_EPI64_NONE: RETURN element[i+63:i]
ESAC
}
DOWNCONVERTSIZE(convertTo) {
CASE convertTo OF
_MM_UPCONV_EPI64_NONE: RETURN 8
ESAC
}
storeOffset := 0
downSize := DOWNCONVERTSIZE(conv)
addr = mt
FOR j := 0 to 7
IF k[j]
i := j*64
tmp := DOWNCONVERT(v1[i+63:i], conv)
storeAddr := addr + storeOffset * downSize
CASE downSize OF
8: MEM[storeAddr] := tmp[63:0]
ESAC
storeOffset := storeOffset + 1
IF ((addr + storeOffset * downSize) % 64) == 0
BREAK
FI
FI
ENDFOR
dst[MAX:512] := 0
vpackstorelpd
void _mm512_extpackstorelo_pd (void * mt, __m512d v1, _MM_DOWNCONV_PD_ENUM conv, int hint)
Synopsis
void _mm512_extpackstorelo_pd (void * mt, __m512d v1, _MM_DOWNCONV_PD_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vpackstorelpd m512 {k}, zmm
CPUID Flags: KNCNI
Description
Down-converts and stores packed double-precision (64-bit) floating-point elements of v1 into a quadword stream according to conv at a logically mapped starting address mt, storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following mt). hint indicates to the processor whether the data is non-temporal.
Operation
DOWNCONVERT(element, convertTo) {
CASE convertTo OF
_MM_UPCONV_PD_NONE: RETURN element[i+63:i]
ESAC
}
DOWNCONVERTSIZE(convertTo) {
CASE convertTo OF
_MM_UPCONV_PD_NONE: RETURN 8
ESAC
}
storeOffset := 0
downSize := DOWNCONVERTSIZE(conv)
addr = mt
FOR j := 0 to 7
i := j*64
tmp := DOWNCONVERT(v1[i+63:i], conv)
storeAddr := addr + storeOffset * downSize
CASE downSize OF
8: MEM[storeAddr] := tmp[63:0]
ESAC
storeOffset := storeOffset + 1
IF ((addr + storeOffset * downSize) % 64) == 0
BREAK
FI
ENDFOR
dst[MAX:512] := 0
vpackstorelpd
void _mm512_mask_extpackstorelo_pd (void * mt, __mmask8 k, __m512d v1, _MM_DOWNCONV_PD_ENUM conv, int hint)
Synopsis
void _mm512_mask_extpackstorelo_pd (void * mt, __mmask8 k, __m512d v1, _MM_DOWNCONV_PD_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vpackstorelpd m512 {k}, zmm
CPUID Flags: KNCNI
Description
Down-converts and stores packed double-precision (64-bit) floating-point elements of v1 into a quadword stream according to conv at a logically mapped starting address mt, storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following mt). hint indicates to the processor whether the data is non-temporal. Elements are stored to memory according to element selector k (elements are skipped when the corresponding mask bit is not set).
Operation
DOWNCONVERT(element, convertTo) {
CASE convertTo OF
_MM_UPCONV_PD_NONE: RETURN element[i+63:i]
ESAC
}
DOWNCONVERTSIZE(convertTo) {
CASE convertTo OF
_MM_UPCONV_PD_NONE: RETURN 8
ESAC
}
storeOffset := 0
downSize := DOWNCONVERTSIZE(conv)
addr = mt
FOR j := 0 to 7
IF k[j]
i := j*64
tmp := DOWNCONVERT(v1[i+63:i], conv)
storeAddr := addr + storeOffset * downSize
CASE downSize OF
8: MEM[storeAddr] := tmp[63:0]
ESAC
storeOffset := storeOffset + 1
IF ((addr + storeOffset * downSize) % 64) == 0
BREAK
FI
FI
ENDFOR
dst[MAX:512] := 0
vpackstorelps
void _mm512_extpackstorelo_ps (void * mt, __m512 v1, _MM_DOWNCONV_PS_ENUM conv, int hint)
Synopsis
void _mm512_extpackstorelo_ps (void * mt, __m512 v1, _MM_DOWNCONV_PS_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vpackstorelps m512 {k}, zmm
CPUID Flags: KNCNI
Description
Down-converts and stores packed single-precision (32-bit) floating-point elements of v1 into a byte/word/doubleword stream according to conv at a logically mapped starting address mt, storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following mt). hint indicates to the processor whether the data is non-temporal.
Operation
DOWNCONVERT(element, convertTo) {
CASE convertTo OF
_MM_UPCONV_PS_NONE: RETURN element[i+31:i]
_MM_UPCONV_PS_FLOAT16: RETURN Float32ToFloat16(element[i+31:i])
_MM_UPCONV_PS_UINT8: RETURN UInt32ToUInt8(element[i+31:i])
_MM_UPCONV_PS_SINT8: RETURN SInt32ToSInt8(element[i+31:i])
_MM_UPCONV_PS_UINT16: RETURN UInt32ToUInt16(element[i+31:i])
_MM_UPCONV_PS_SINT16: RETURN SInt32ToSInt16(element[i+31:i])
ESAC
}
DOWNCONVERTSIZE(convertTo) {
CASE convertTo OF
_MM_UPCONV_PS_NONE: RETURN 4
_MM_UPCONV_PS_FLOAT16: RETURN 2
_MM_UPCONV_PS_UINT8: RETURN 1
_MM_UPCONV_PS_SINT8: RETURN 1
_MM_UPCONV_PS_UINT16: RETURN 2
_MM_UPCONV_PS_SINT16: RETURN 2
ESAC
}
storeOffset := 0
downSize := DOWNCONVERTSIZE(conv)
addr = mt
FOR j := 0 to 15
i := j*32
tmp := DOWNCONVERT(v1[i+31:i], conv)
storeAddr := addr + storeOffset * downSize
CASE downSize OF
4: MEM[storeAddr] := tmp[31:0]
2: MEM[storeAddr] := tmp[15:0]
1: MEM[storeAddr] := tmp[7:0]
ESAC
storeOffset := storeOffset + 1
IF ((addr + storeOffset * downSize) % 64) == 0
BREAK
FI
ENDFOR
dst[MAX:512] := 0
vpackstorelps
void _mm512_mask_extpackstorelo_ps (void * mt, __mmask16 k, __m512 v1, _MM_DOWNCONV_PS_ENUM conv, int hint)
Synopsis
void _mm512_mask_extpackstorelo_ps (void * mt, __mmask16 k, __m512 v1, _MM_DOWNCONV_PS_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vpackstorelps m512 {k}, zmm
CPUID Flags: KNCNI
Description
Down-converts and stores packed single-precision (32-bit) floating-point elements of v1 into a byte/word/doubleword stream according to conv at a logically mapped starting address mt, storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following mt). hint indicates to the processor whether the data is non-temporal. Elements are stored to memory according to element selector k (elements are skipped when the corresponding mask bit is not set).
Operation
DOWNCONVERT(element, convertTo) {
CASE convertTo OF
_MM_UPCONV_PS_NONE: RETURN element[i+31:i]
_MM_UPCONV_PS_FLOAT16: RETURN Float32ToFloat16(element[i+31:i])
_MM_UPCONV_PS_UINT8: RETURN UInt32ToUInt8(element[i+31:i])
_MM_UPCONV_PS_SINT8: RETURN SInt32ToSInt8(element[i+31:i])
_MM_UPCONV_PS_UINT16: RETURN UInt32ToUInt16(element[i+31:i])
_MM_UPCONV_PS_SINT16: RETURN SInt32ToSInt16(element[i+31:i])
ESAC
}
DOWNCONVERTSIZE(convertTo) {
CASE convertTo OF
_MM_UPCONV_PS_NONE: RETURN 4
_MM_UPCONV_PS_FLOAT16: RETURN 2
_MM_UPCONV_PS_UINT8: RETURN 1
_MM_UPCONV_PS_SINT8: RETURN 1
_MM_UPCONV_PS_UINT16: RETURN 2
_MM_UPCONV_PS_SINT16: RETURN 2
ESAC
}
storeOffset := 0
downSize := DOWNCONVERTSIZE(conv)
addr = mt
FOR j := 0 to 15
IF k[j]
i := j*32
tmp := DOWNCONVERT(v1[i+31:i], conv)
storeAddr := addr + storeOffset * downSize
CASE downSize OF
4: MEM[storeAddr] := tmp[31:0]
2: MEM[storeAddr] := tmp[15:0]
1: MEM[storeAddr] := tmp[7:0]
ESAC
storeOffset := storeOffset + 1
IF ((addr + storeOffset * downSize) % 64) == 0
BREAK
FI
FI
ENDFOR
dst[MAX:512] := 0
pextrw
int _mm_extract_epi16 (__m128i a, int imm8)
Synopsis
int _mm_extract_epi16 (__m128i a, int imm8)
#include "emmintrin.h"
Instruction: pextrw r32, xmm, imm
CPUID Flags: SSE2
Description
Extract a 16-bit integer from a, selected with imm8, and store the result in the lower element of dst.
Operation
dst[15:0] := (a[127:0] >> (imm8[2:0] * 16))[15:0]
dst[31:16] := 0
Performance
...
__int16 _mm256_extract_epi16 (__m256i a, const int index)
Synopsis
__int16 _mm256_extract_epi16 (__m256i a, const int index)
#include "immintrin.h"
CPUID Flags: AVX
Description
Extract a 16-bit integer from a, selected with index, and store the result in dst.
Operation
dst[15:0] := (a[255:0] >> (index * 16))[15:0]
pextrd
int _mm_extract_epi32 (__m128i a, const int imm8)
Synopsis
int _mm_extract_epi32 (__m128i a, const int imm8)
#include "smmintrin.h"
Instruction: pextrd r32, xmm, imm
CPUID Flags: SSE4.1
Description
Extract a 32-bit integer from a, selected with imm8, and store the result in dst.
Operation
dst[31:0] := (a[127:0] >> (imm8[1:0] * 32))[31:0]
Performance
...
__int32 _mm256_extract_epi32 (__m256i a, const int index)
Synopsis
__int32 _mm256_extract_epi32 (__m256i a, const int index)
#include "immintrin.h"
CPUID Flags: AVX
Description
Extract a 32-bit integer from a, selected with index, and store the result in dst.
Operation
dst[31:0] := (a[255:0] >> (index * 32))[31:0]
pextrq
__int64 _mm_extract_epi64 (__m128i a, const int imm8)
Synopsis
__int64 _mm_extract_epi64 (__m128i a, const int imm8)
#include "smmintrin.h"
Instruction: pextrq r64, xmm, imm
CPUID Flags: SSE4.1
Description
Extract a 64-bit integer from a, selected with imm8, and store the result in dst.
Operation
dst[63:0] := (a[127:0] >> (imm8[0] * 64))[63:0]
Performance
...
__int64 _mm256_extract_epi64 (__m256i a, const int index)
Synopsis
__int64 _mm256_extract_epi64 (__m256i a, const int index)
#include "immintrin.h"
CPUID Flags: AVX
Description
Extract a 64-bit integer from a, selected with index, and store the result in dst.
Operation
dst[63:0] := (a[255:0] >> (index * 64))[63:0]
pextrb
int _mm_extract_epi8 (__m128i a, const int imm8)
Synopsis
int _mm_extract_epi8 (__m128i a, const int imm8)
#include "smmintrin.h"
Instruction: pextrb r32, xmm, imm
CPUID Flags: SSE4.1
Description
Extract an 8-bit integer from a, selected with imm8, and store the result in the lower element of dst.
Operation
dst[7:0] := (a[127:0] >> (imm8[3:0] * 8))[7:0]
dst[31:8] := 0
Performance
...
__int8 _mm256_extract_epi8 (__m256i a, const int index)
Synopsis
__int8 _mm256_extract_epi8 (__m256i a, const int index)
#include "immintrin.h"
CPUID Flags: AVX
Description
Extract an 8-bit integer from a, selected with index, and store the result in dst.
Operation
dst[7:0] := (a[255:0] >> (index * 8))[7:0]
pextrw
int _mm_extract_pi16 (__m64 a, int imm8)
Synopsis
int _mm_extract_pi16 (__m64 a, int imm8)
#include "xmmintrin.h"
Instruction: pextrw r32, mm, imm
CPUID Flags: SSE
Description
Extract a 16-bit integer from a, selected with imm8, and store the result in the lower element of dst.
Operation
dst[15:0] := (a[63:0] >> (imm8[1:0] * 16))[15:0]
dst[31:16] := 0
extractps
int _mm_extract_ps (__m128 a, const int imm8)
Synopsis
int _mm_extract_ps (__m128 a, const int imm8)
#include "smmintrin.h"
Instruction: extractps r32, xmm, imm
CPUID Flags: SSE4.1
Description
Extract a single-precision (32-bit) floating-point element from a, selected with imm8, and store the result in dst.
Operation
dst[31:0] := (a[127:0] >> (imm8[1:0] * 32))[31:0]
Performance
vextractf128
__m128d _mm256_extractf128_pd (__m256d a, const int imm8)
Synopsis
__m128d _mm256_extractf128_pd (__m256d a, const int imm8)
#include "immintrin.h"
Instruction: vextractf128 xmm, ymm, imm
CPUID Flags: AVX
Description
Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a, selected with imm8, and store the result in dst.
Operation
CASE imm8[7:0] of
0: dst[127:0] := a[127:0]
1: dst[127:0] := a[255:128]
ESAC
dst[MAX:128] := 0
Performance
vextractf128
__m128 _mm256_extractf128_ps (__m256 a, const int imm8)
Synopsis
__m128 _mm256_extractf128_ps (__m256 a, const int imm8)
#include "immintrin.h"
Instruction: vextractf128 xmm, ymm, imm
CPUID Flags: AVX
Description
Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the result in dst.
Operation
CASE imm8[7:0] of
0: dst[127:0] := a[127:0]
1: dst[127:0] := a[255:128]
ESAC
dst[MAX:128] := 0
Performance
vextractf128
__m128i _mm256_extractf128_si256 (__m256i a, const int imm8)
Synopsis
__m128i _mm256_extractf128_si256 (__m256i a, const int imm8)
#include "immintrin.h"
Instruction: vextractf128 xmm, ymm, imm
CPUID Flags: AVX
Description
Extract 128 bits (composed of integer data) from a, selected with imm8, and store the result in dst.
Operation
CASE imm8[7:0] of
0: dst[127:0] := a[127:0]
1: dst[127:0] := a[255:128]
ESAC
dst[MAX:128] := 0
Performance
vextractf32x4
__m128 _mm256_extractf32x4_ps (__m256 a, int imm8)
Synopsis
__m128 _mm256_extractf32x4_ps (__m256 a, int imm8)
#include "immintrin.h"
Instruction: vextractf32x4
CPUID Flags: AVX512VL + AVX512F
Description
Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the result in dst.
Operation
CASE imm8[7:0] of
0: dst[127:0] := a[127:0]
1: dst[127:0] := a[255:128]
ESAC
dst[MAX:128] := 0
vextractf32x4
__m128 _mm256_mask_extractf32x4_ps (__m128 src, __mmask8 k, __m256 a, int imm8)
Synopsis
__m128 _mm256_mask_extractf32x4_ps (__m128 src, __mmask8 k, __m256 a, int imm8)
#include "immintrin.h"
Instruction: vextractf32x4
CPUID Flags: AVX512VL + AVX512F
Description
Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
CASE imm8[7:0] of
0: tmp[127:0] := a[127:0]
1: tmp[127:0] := a[255:128]
ESAC
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := tmp[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vextractf32x4
__m128 _mm256_maskz_extractf32x4_ps (__mmask8 k, __m256 a, int imm8)
Synopsis
__m128 _mm256_maskz_extractf32x4_ps (__mmask8 k, __m256 a, int imm8)
#include "immintrin.h"
Instruction: vextractf32x4
CPUID Flags: AVX512VL + AVX512F
Description
Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
CASE imm8[7:0] of
0: tmp[127:0] := a[127:0]
1: tmp[127:0] := a[255:128]
ESAC
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := tmp[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vextractf32x4
__m128 _mm512_extractf32x4_ps (__m512 a, int imm8)
Synopsis
__m128 _mm512_extractf32x4_ps (__m512 a, int imm8)
#include "immintrin.h"
Instruction: vextractf32x4 xmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the result in dst.
Operation
CASE imm8[7:0] of
0: dst[127:0] := a[127:0]
1: dst[127:0] := a[255:128]
2: dst[127:0] := a[383:256]
3: dst[127:0] := a[511:384]
ESAC
dst[MAX:128] := 0
vextractf32x4
__m128 _mm512_mask_extractf32x4_ps (__m128 src, __mmask8 k, __m512 a, int imm8)
Synopsis
__m128 _mm512_mask_extractf32x4_ps (__m128 src, __mmask8 k, __m512 a, int imm8)
#include "immintrin.h"
Instruction: vextractf32x4 xmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
CASE imm8[7:0] of
0: tmp[127:0] := a[127:0]
1: tmp[127:0] := a[255:128]
2: tmp[127:0] := a[383:256]
3: tmp[127:0] := a[511:384]
ESAC
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := tmp[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vextractf32x4
__m128 _mm512_maskz_extractf32x4_ps (__mmask8 k, __m512 a, int imm8)
Synopsis
__m128 _mm512_maskz_extractf32x4_ps (__mmask8 k, __m512 a, int imm8)
#include "immintrin.h"
Instruction: vextractf32x4 xmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
CASE imm8[7:0] of
0: tmp[127:0] := a[127:0]
1: tmp[127:0] := a[255:128]
2: tmp[127:0] := a[383:256]
3: tmp[127:0] := a[511:384]
ESAC
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := tmp[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vextractf32x8
__m256 _mm512_extractf32x8_ps (__m512 a, int imm8)
Synopsis
__m256 _mm512_extractf32x8_ps (__m512 a, int imm8)
#include "immintrin.h"
Instruction: vextractf32x8
CPUID Flags: AVX512DQ
Description
Extract 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the result in dst.
Operation
CASE imm8[7:0] of
0: dst[255:0] := a[255:0]
1: dst[255:0] := a[511:256]
ESAC
dst[MAX:256] := 0
vextractf32x8
__m256 _mm512_mask_extractf32x8_ps (__m256 src, __mmask8 k, __m512 a, int imm8)
Synopsis
__m256 _mm512_mask_extractf32x8_ps (__m256 src, __mmask8 k, __m512 a, int imm8)
#include "immintrin.h"
Instruction: vextractf32x8
CPUID Flags: AVX512DQ
Description
Extract 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
CASE imm8[7:0] of
0: tmp[255:0] := a[255:0]
1: tmp[255:0] := a[511:256]
ESAC
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := tmp[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vextractf32x8
__m256 _mm512_maskz_extractf32x8_ps (__mmask8 k, __m512 a, int imm8)
Synopsis
__m256 _mm512_maskz_extractf32x8_ps (__mmask8 k, __m512 a, int imm8)
#include "immintrin.h"
Instruction: vextractf32x8
CPUID Flags: AVX512DQ
Description
Extract 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
CASE imm8[7:0] of
0: tmp[255:0] := a[255:0]
1: tmp[255:0] := a[511:256]
ESAC
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := tmp[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vextractf64x2
__m128d _mm256_extractf64x2_pd (__m256d a, int imm8)
Synopsis
__m128d _mm256_extractf64x2_pd (__m256d a, int imm8)
#include "immintrin.h"
Instruction: vextractf64x2
CPUID Flags: AVX512VL + AVX512DQ
Description
Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a, selected with imm8, and store the result in dst.
Operation
CASE imm8[7:0] of
0: dst[127:0] := a[127:0]
1: dst[127:0] := a[255:128]
ESAC
dst[MAX:128] := 0
vextractf64x2
__m128d _mm256_mask_extractf64x2_pd (__m128d src, __mmask8 k, __m256d a, int imm8)
Synopsis
__m128d _mm256_mask_extractf64x2_pd (__m128d src, __mmask8 k, __m256d a, int imm8)
#include "immintrin.h"
Instruction: vextractf64x2
CPUID Flags: AVX512VL + AVX512DQ
Description
Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a, selected with imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
CASE imm8[7:0] of
0: tmp[127:0] := a[127:0]
1: tmp[127:0] := a[255:128]
ESAC
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := tmp[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vextractf64x2
__m128d _mm256_maskz_extractf64x2_pd (__mmask8 k, __m256d a, int imm8)
Synopsis
__m128d _mm256_maskz_extractf64x2_pd (__mmask8 k, __m256d a, int imm8)
#include "immintrin.h"
Instruction: vextractf64x2
CPUID Flags: AVX512VL + AVX512DQ
Description
Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a, selected with imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
CASE imm8[7:0] of
0: tmp[127:0] := a[127:0]
1: tmp[127:0] := a[255:128]
ESAC
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := tmp[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vextractf64x2
__m128d _mm512_extractf64x2_pd (__m512d a, int imm8)
Synopsis
__m128d _mm512_extractf64x2_pd (__m512d a, int imm8)
#include "immintrin.h"
Instruction: vextractf64x2
CPUID Flags: AVX512DQ
Description
Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a, selected with imm8, and store the result in dst.
Operation
CASE imm8[7:0] of
0: dst[127:0] := a[127:0]
1: dst[127:0] := a[255:128]
2: dst[127:0] := a[383:256]
3: dst[127:0] := a[511:384]
ESAC
dst[MAX:128] := 0
vextractf64x2
__m128d _mm512_mask_extractf64x2_pd (__m128d src, __mmask8 k, __m512d a, int imm8)
Synopsis
__m128d _mm512_mask_extractf64x2_pd (__m128d src, __mmask8 k, __m512d a, int imm8)
#include "immintrin.h"
Instruction: vextractf64x2
CPUID Flags: AVX512DQ
Description
Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a, selected with imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
CASE imm8[7:0] of
0: tmp[127:0] := a[127:0]
1: tmp[127:0] := a[255:128]
2: tmp[127:0] := a[383:256]
3: tmp[127:0] := a[511:384]
ESAC
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := tmp[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vextractf64x2
__m128d _mm512_maskz_extractf64x2_pd (__mmask8 k, __m512d a, int imm8)
Synopsis
__m128d _mm512_maskz_extractf64x2_pd (__mmask8 k, __m512d a, int imm8)
#include "immintrin.h"
Instruction: vextractf64x2
CPUID Flags: AVX512DQ
Description
Extract 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a, selected with imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
CASE imm8[7:0] of
0: tmp[127:0] := a[127:0]
1: tmp[127:0] := a[255:128]
2: tmp[127:0] := a[383:256]
3: tmp[127:0] := a[511:384]
ESAC
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := tmp[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vextractf64x4
__m256d _mm512_extractf64x4_pd (__m512d a, int imm8)
Synopsis
__m256d _mm512_extractf64x4_pd (__m512d a, int imm8)
#include "immintrin.h"
Instruction: vextractf64x4 ymm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from a, selected with imm8, and store the result in dst.
Operation
CASE imm8[7:0] of
0: dst[255:0] := a[255:0]
1: dst[255:0] := a[511:256]
ESAC
dst[MAX:256] := 0
vextractf64x4
__m256d _mm512_mask_extractf64x4_pd (__m256d src, __mmask8 k, __m512d a, int imm8)
Synopsis
__m256d _mm512_mask_extractf64x4_pd (__m256d src, __mmask8 k, __m512d a, int imm8)
#include "immintrin.h"
Instruction: vextractf64x4 ymm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from a, selected with imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
CASE imm8[7:0] of
0: tmp[255:0] := a[255:0]
1: tmp[255:0] := a[511:256]
ESAC
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := tmp[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vextractf64x4
__m256d _mm512_maskz_extractf64x4_pd (__mmask8 k, __m512d a, int imm8)
Synopsis
__m256d _mm512_maskz_extractf64x4_pd (__mmask8 k, __m512d a, int imm8)
#include "immintrin.h"
Instruction: vextractf64x4 ymm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from a, selected with imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
CASE imm8[7:0] of
0: tmp[255:0] := a[255:0]
1: tmp[255:0] := a[511:256]
ESAC
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := tmp[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vextracti128
__m128i _mm256_extracti128_si256 (__m256i a, const int imm8)
Synopsis
__m128i _mm256_extracti128_si256 (__m256i a, const int imm8)
#include "immintrin.h"
Instruction: vextracti128 xmm, ymm, imm
CPUID Flags: AVX2
Description
Extract 128 bits (composed of integer data) from a, selected with imm8, and store the result in dst.
Operation
CASE imm8[7:0] of
0: dst[127:0] := a[127:0]
1: dst[127:0] := a[255:128]
ESAC
dst[MAX:128] := 0
Performance
vextracti32x4
__m128i _mm256_extracti32x4_epi32 (__m256i a, int imm8)
Synopsis
__m128i _mm256_extracti32x4_epi32 (__m256i a, int imm8)
#include "immintrin.h"
Instruction: vextracti32x4
CPUID Flags: AVX512VL + AVX512F
Description
Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with imm8, and store the result in dst.
Operation
CASE imm8[7:0] of
0: dst[127:0] := a[127:0]
1: dst[127:0] := a[255:128]
ESAC
dst[MAX:128] := 0
vextracti32x4
__m128i _mm256_mask_extracti32x4_epi32 (__m128i src, __mmask8 k, __m256i a, int imm8)
Synopsis
__m128i _mm256_mask_extracti32x4_epi32 (__m128i src, __mmask8 k, __m256i a, int imm8)
#include "immintrin.h"
Instruction: vextracti32x4
CPUID Flags: AVX512VL + AVX512F
Description
Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
CASE imm8[7:0] of
0: tmp[127:0] := a[127:0]
1: tmp[127:0] := a[255:128]
ESAC
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := tmp[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vextracti32x4
__m128i _mm256_maskz_extracti32x4_epi32 (__mmask8 k, __m256i a, int imm8)
Synopsis
__m128i _mm256_maskz_extracti32x4_epi32 (__mmask8 k, __m256i a, int imm8)
#include "immintrin.h"
Instruction: vextracti32x4
CPUID Flags: AVX512VL + AVX512F
Description
Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
CASE imm8[7:0] of
0: tmp[127:0] := a[127:0]
1: tmp[127:0] := a[255:128]
ESAC
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := tmp[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vextracti32x4
__m128i _mm512_extracti32x4_epi32 (__m512i a, int imm8)
Synopsis
__m128i _mm512_extracti32x4_epi32 (__m512i a, int imm8)
#include "immintrin.h"
Instruction: vextracti32x4 xmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with imm8, and store the result in dst.
Operation
CASE imm8[7:0] of
0: dst[127:0] := a[127:0]
1: dst[127:0] := a[255:128]
2: dst[127:0] := a[383:256]
3: dst[127:0] := a[511:384]
ESAC
dst[MAX:128] := 0
vextracti32x4
__m128i _mm512_mask_extracti32x4_epi32 (__m128i src, __mmask8 k, __m512i a, int imm8)
Synopsis
__m128i _mm512_mask_extracti32x4_epi32 (__m128i src, __mmask8 k, __m512i a, int imm8)
#include "immintrin.h"
Instruction: vextracti32x4 xmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
CASE imm8[7:0] of
0: tmp[127:0] := a[127:0]
1: tmp[127:0] := a[255:128]
2: tmp[127:0] := a[383:256]
3: tmp[127:0] := a[511:384]
ESAC
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := tmp[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vextracti32x4
__m128i _mm512_maskz_extracti32x4_epi32 (__mmask8 k, __m512i a, int imm8)
Synopsis
__m128i _mm512_maskz_extracti32x4_epi32 (__mmask8 k, __m512i a, int imm8)
#include "immintrin.h"
Instruction: vextracti32x4 xmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Extract 128 bits (composed of 4 packed 32-bit integers) from a, selected with imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
CASE imm8[7:0] of
0: tmp[127:0] := a[127:0]
1: tmp[127:0] := a[255:128]
2: tmp[127:0] := a[383:256]
3: tmp[127:0] := a[511:384]
ESAC
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := tmp[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vextracti32x8
__m256i _mm512_extracti32x8_epi32 (__m512i a, int imm8)
Synopsis
__m256i _mm512_extracti32x8_epi32 (__m512i a, int imm8)
#include "immintrin.h"
Instruction: vextracti32x8
CPUID Flags: AVX512DQ
Description
Extract 256 bits (composed of 8 packed 32-bit integers) from a, selected with imm8, and store the result in dst.
Operation
CASE imm8[7:0] of
0: dst[255:0] := a[255:0]
1: dst[255:0] := a[511:256]
ESAC
dst[MAX:256] := 0
vextracti32x8
__m256i _mm512_mask_extracti32x8_epi32 (__m256i src, __mmask8 k, __m512i a, int imm8)
Synopsis
__m256i _mm512_mask_extracti32x8_epi32 (__m256i src, __mmask8 k, __m512i a, int imm8)
#include "immintrin.h"
Instruction: vextracti32x8
CPUID Flags: AVX512DQ
Description
Extract 256 bits (composed of 8 packed 32-bit integers) from a, selected with imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
CASE imm8[7:0] of
0: tmp[255:0] := a[255:0]
1: tmp[255:0] := a[511:256]
ESAC
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := tmp[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vextracti32x8
__m256i _mm512_maskz_extracti32x8_epi32 (__mmask8 k, __m512i a, int imm8)
Synopsis
__m256i _mm512_maskz_extracti32x8_epi32 (__mmask8 k, __m512i a, int imm8)
#include "immintrin.h"
Instruction: vextracti32x8
CPUID Flags: AVX512DQ
Description
Extract 256 bits (composed of 8 packed 32-bit integers) from a, selected with imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
CASE imm8[7:0] of
0: tmp[255:0] := a[255:0]
1: tmp[255:0] := a[511:256]
ESAC
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := tmp[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vextracti64x2
__m128i _mm256_extracti64x2_epi64 (__m256i a, int imm8)
Synopsis
__m128i _mm256_extracti64x2_epi64 (__m256i a, int imm8)
#include "immintrin.h"
Instruction: vextracti64x2
CPUID Flags: AVX512VL + AVX512DQ
Description
Extract 128 bits (composed of 2 packed 64-bit integers) from a, selected with imm8, and store the result in dst.
Operation
CASE imm8[7:0] of
0: dst[127:0] := a[127:0]
1: dst[127:0] := a[255:128]
ESAC
dst[MAX:128] := 0
vextracti64x2
__m128i _mm256_mask_extracti64x2_epi64 (__m128i src, __mmask8 k, __m256i a, int imm8)
Synopsis
__m128i _mm256_mask_extracti64x2_epi64 (__m128i src, __mmask8 k, __m256i a, int imm8)
#include "immintrin.h"
Instruction: vextracti64x2
CPUID Flags: AVX512VL + AVX512DQ
Description
Extract 128 bits (composed of 2 packed 64-bit integers) from a, selected with imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
CASE imm8[7:0] of
0: tmp[127:0] := a[127:0]
1: tmp[127:0] := a[255:128]
ESAC
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := tmp[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vextracti64x2
__m128i _mm256_maskz_extracti64x2_epi64 (__mmask8 k, __m256i a, int imm8)
Synopsis
__m128i _mm256_maskz_extracti64x2_epi64 (__mmask8 k, __m256i a, int imm8)
#include "immintrin.h"
Instruction: vextracti64x2
CPUID Flags: AVX512VL + AVX512DQ
Description
Extract 128 bits (composed of 2 packed 64-bit integers) from a, selected with imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
CASE imm8[7:0] of
0: tmp[127:0] := a[127:0]
1: tmp[127:0] := a[255:128]
ESAC
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := tmp[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vextracti64x2
__m128i _mm512_extracti64x2_epi64 (__m512i a, int imm8)
Synopsis
__m128i _mm512_extracti64x2_epi64 (__m512i a, int imm8)
#include "immintrin.h"
Instruction: vextracti64x2
CPUID Flags: AVX512DQ
Description
Extract 128 bits (composed of 2 packed 64-bit integers) from a, selected with imm8, and store the result in dst.
Operation
CASE imm8[7:0] of
0: dst[127:0] := a[127:0]
1: dst[127:0] := a[255:128]
2: dst[127:0] := a[383:256]
3: dst[127:0] := a[511:384]
ESAC
dst[MAX:128] := 0
vextracti64x2
__m128i _mm512_mask_extracti64x2_epi64 (__m128i src, __mmask8 k, __m512i a, int imm8)
Synopsis
__m128i _mm512_mask_extracti64x2_epi64 (__m128i src, __mmask8 k, __m512i a, int imm8)
#include "immintrin.h"
Instruction: vextracti64x2
CPUID Flags: AVX512DQ
Description
Extract 128 bits (composed of 2 packed 64-bit integers) from a, selected with imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
CASE imm8[7:0] of
0: tmp[127:0] := a[127:0]
1: tmp[127:0] := a[255:128]
2: tmp[127:0] := a[383:256]
3: tmp[127:0] := a[511:384]
ESAC
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := tmp[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vextracti64x2
__m128i _mm512_maskz_extracti64x2_epi64 (__mmask8 k, __m512i a, int imm8)
Synopsis
__m128i _mm512_maskz_extracti64x2_epi64 (__mmask8 k, __m512i a, int imm8)
#include "immintrin.h"
Instruction: vextracti64x2
CPUID Flags: AVX512DQ
Description
Extract 128 bits (composed of 2 packed 64-bit integers) from a, selected with imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
CASE imm8[7:0] of
0: tmp[127:0] := a[127:0]
1: tmp[127:0] := a[255:128]
2: tmp[127:0] := a[383:256]
3: tmp[127:0] := a[511:384]
ESAC
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := tmp[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vextracti64x4
__m256i _mm512_extracti64x4_epi64 (__m512i a, int imm8)
Synopsis
__m256i _mm512_extracti64x4_epi64 (__m512i a, int imm8)
#include "immintrin.h"
Instruction: vextracti64x4 ymm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Extract 256 bits (composed of 4 packed 64-bit integers) from a, selected with imm8, and store the result in dst.
Operation
CASE imm8[7:0] of
0: dst[255:0] := a[255:0]
1: dst[255:0] := a[511:256]
ESAC
dst[MAX:256] := 0
vextracti64x4
__m256i _mm512_mask_extracti64x4_epi64 (__m256i src, __mmask8 k, __m512i a, int imm8)
Synopsis
__m256i _mm512_mask_extracti64x4_epi64 (__m256i src, __mmask8 k, __m512i a, int imm8)
#include "immintrin.h"
Instruction: vextracti64x4 ymm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Extract 256 bits (composed of 4 packed 64-bit integers) from a, selected with imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
CASE imm8[7:0] of
0: tmp[255:0] := a[255:0]
1: tmp[255:0] := a[511:256]
ESAC
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := tmp[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vextracti64x4
__m256i _mm512_maskz_extracti64x4_epi64 (__mmask8 k, __m512i a, int imm8)
Synopsis
__m256i _mm512_maskz_extracti64x4_epi64 (__mmask8 k, __m512i a, int imm8)
#include "immintrin.h"
Instruction: vextracti64x4 ymm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Extract 256 bits (composed of 4 packed 64-bit integers) from a, selected with imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
CASE imm8[7:0] of
0: tmp[255:0] := a[255:0]
1: tmp[255:0] := a[511:256]
ESAC
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := tmp[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vmovdqa32
void _mm512_extstore_epi32 (void * mt, __m512i v, _MM_DOWNCONV_EPI32_ENUM conv, int hint)
Synopsis
void _mm512_extstore_epi32 (void * mt, __m512i v, _MM_DOWNCONV_EPI32_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vmovdqa32 m512 {k}, zmm
CPUID Flags: KNCNI
Description
Downconverts packed 32-bit integer elements stored in v to a smaller type depending on conv and stores them in memory location mt. hint indicates to the processor whether the data is non-temporal.
Operation
addr := MEM[mt]
FOR j := 0 to 15
i := j*32
CASE conv OF
_MM_DOWNCONV_EPI32_NONE:
addr[i+31:i] := v[i+31:i]
_MM_DOWNCONV_EPI32_UINT8:
n := j*8
addr[n+7:n] := Int32ToUInt8(v[i+31:i])
_MM_DOWNCONV_EPI32_SINT8:
n := j*8
addr[n+7:n] := Int32ToSInt8(v[i+31:i])
_MM_DOWNCONV_EPI32_UINT16:
n := j*16
addr[n+15:n] := Int32ToUInt16(v[i+31:i])
_MM_DOWNCONV_EPI32_SINT16:
n := j*16
addr[n+15:n] := Int32ToSInt16(v[i+31:i])
ESAC
ENDFOR
vmovdqa32
void _mm512_mask_extstore_epi32 (void * mt, __mmask16 k, __m512i v, _MM_DOWNCONV_EPI32_ENUM conv, int hint)
Synopsis
void _mm512_mask_extstore_epi32 (void * mt, __mmask16 k, __m512i v, _MM_DOWNCONV_EPI32_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vmovdqa32 m512 {k}, zmm
CPUID Flags: KNCNI
Description
Downconverts packed 32-bit integer elements stored in v to a smaller type depending on conv and stores them in memory location mt (elements in mt are unaltered when the corresponding mask bit is not set). hint indicates to the processor whether the data is non-temporal.
Operation
addr := MEM[mt]
FOR j := 0 to 15
i := j*32
IF k[j]
CASE conv OF
_MM_DOWNCONV_EPI32_NONE:
addr[i+31:i] := v[i+31:i]
_MM_DOWNCONV_EPI32_UINT8:
n := j*8
addr[n+7:n] := Int32ToUInt8(v[i+31:i])
_MM_DOWNCONV_EPI32_SINT8:
n := j*8
addr[n+7:n] := Int32ToSInt8(v[i+31:i])
_MM_DOWNCONV_EPI32_UINT16:
n := j*16
addr[n+15:n] := Int32ToUInt16(v[i+31:i])
_MM_DOWNCONV_EPI32_SINT16:
n := j*16
addr[n+15:n] := Int32ToSInt16(v[i+31:i])
ESAC
FI
ENDFOR
vmovdqa64
void _mm512_extstore_epi64 (void * mt, __m512i v, _MM_DOWNCONV_EPI64_ENUM conv, int hint)
Synopsis
void _mm512_extstore_epi64 (void * mt, __m512i v, _MM_DOWNCONV_EPI64_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vmovdqa64 m512 {k}, zmm
CPUID Flags: KNCNI
Description
Downconverts packed 64-bit integer elements stored in v to a smaller type depending on conv and stores them in memory location mt. hint indicates to the processor whether the data is non-temporal.
Operation
addr := MEM[mt]
FOR j := 0 to 7
i := j*64
CASE conv OF
_MM_DOWNCONV_EPI64_NONE: addr[i+63:i] := v[i+63:i]
ESAC
ENDFOR
vmovdqa64
void _mm512_mask_extstore_epi64 (void * mt, __mmask8 k, __m512i v, _MM_DOWNCONV_EPI64_ENUM conv, int hint)
Synopsis
void _mm512_mask_extstore_epi64 (void * mt, __mmask8 k, __m512i v, _MM_DOWNCONV_EPI64_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vmovdqa64 m512 {k}, zmm
CPUID Flags: KNCNI
Description
Downconverts packed 64-bit integer elements stored in v to a smaller type depending on conv and stores them in memory location mt (elements in mt are unaltered when the corresponding mask bit is not set). hint indicates to the processor whether the data is non-temporal.
Operation
addr := MEM[mt]
FOR j := 0 to 7
i := j*64
IF k[j]
CASE conv OF
_MM_DOWNCONV_EPI64_NONE: addr[i+63:i] := v[i+63:i]
ESAC
FI
ENDFOR
vmovapd
void _mm512_extstore_pd (void * mt, __m512d v, _MM_DOWNCONV_PD_ENUM conv, int hint)
Synopsis
void _mm512_extstore_pd (void * mt, __m512d v, _MM_DOWNCONV_PD_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vmovapd m512 {k}, zmm
CPUID Flags: KNCNI
Description
Downconverts packed double-precision (64-bit) floating-point elements stored in v to a smaller type depending on conv and stores them in memory location mt. hint indicates to the processor whether the data is non-temporal.
Operation
addr := MEM[mt]
FOR j := 0 to 7
i := j*64
CASE conv OF
_MM_DOWNCONV_PD_NONE:
addr[i+63:i] := v[i+63:i]
ESAC
ENDFOR
vmovapd
void _mm512_mask_extstore_pd (void * mt, __mmask8 k, __m512d v, _MM_DOWNCONV_PD_ENUM conv, int hint)
Synopsis
void _mm512_mask_extstore_pd (void * mt, __mmask8 k, __m512d v, _MM_DOWNCONV_PD_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vmovapd m512 {k}, zmm
CPUID Flags: KNCNI
Description
Downconverts packed double-precision (64-bit) floating-point elements stored in v to a smaller type depending on conv and stores them in memory location mt (elements in mt are unaltered when the corresponding mask bit is not set). hint indicates to the processor whether the data is non-temporal.
Operation
addr := MEM[mt]
FOR j := 0 to 7
i := j*64
IF k[j]
CASE conv OF
_MM_DOWNCONV_PD_NONE:
addr[i+63:i] := v[i+63:i]
ESAC
FI
ENDFOR
vmovaps
void _mm512_extstore_ps (void * mt, __m512 v, _MM_DOWNCONV_PS_ENUM conv, int hint)
Synopsis
void _mm512_extstore_ps (void * mt, __m512 v, _MM_DOWNCONV_PS_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vmovaps m512 {k}, zmm
CPUID Flags: KNCNI
Description
Downconverts packed single-precision (32-bit) floating-point elements stored in v to a smaller type depending on conv and stores them in memory location mt. hint indicates to the processor whether the data is non-temporal.
Operation
addr := MEM[mt]
FOR j := 0 to 15
i := j*32
CASE conv OF
_MM_DOWNCONV_PS_NONE:
addr[i+31:i] := v[i+31:i]
_MM_DOWNCONV_PS_FLOAT16:
n := j*16
addr[n+15:n] := Float32ToFloat16(v[i+31:i])
_MM_DOWNCONV_PS_UINT8:
n := j*8
addr[n+7:n] := Float32ToUInt8(v[i+31:i])
_MM_DOWNCONV_PS_SINT8:
n := j*8
addr[n+7:n] := Float32ToSInt8(v[i+31:i])
_MM_DOWNCONV_PS_UINT16:
n := j*16
addr[n+15:n] := Float32ToUInt16(v[i+31:i])
_MM_DOWNCONV_PS_SINT16:
n := j*16
addr[n+15:n] := Float32ToSInt16(v[i+31:i])
ESAC
ENDFOR
vmovaps
void _mm512_mask_extstore_ps (void * mt, __mmask16 k, __m512 v, _MM_DOWNCONV_PS_ENUM conv, int hint)
Synopsis
void _mm512_mask_extstore_ps (void * mt, __mmask16 k, __m512 v, _MM_DOWNCONV_PS_ENUM conv, int hint)
#include "immintrin.h"
Instruction: vmovaps m512 {k}, zmm
CPUID Flags: KNCNI
Description
Downconverts packed single-precision (32-bit) floating-point elements stored in v to a smaller type depending on conv and stores them in memory location mt using writemask k (elements are not written to memory when the corresponding mask bit is not set). hint indicates to the processor whether the data is non-temporal.
Operation
addr := MEM[mt]
FOR j := 0 to 15
i := j*32
IF k[j]
CASE conv OF
_MM_DOWNCONV_PS_NONE:
addr[i+31:i] := v[i+31:i]
_MM_DOWNCONV_PS_FLOAT16:
n := j*16
addr[n+15:n] := Float32ToFloat16(v[i+31:i])
_MM_DOWNCONV_PS_UINT8:
n := j*8
addr[n+7:n] := Float32ToUInt8(v[i+31:i])
_MM_DOWNCONV_PS_SINT8:
n := j*8
addr[n+7:n] := Float32ToSInt8(v[i+31:i])
_MM_DOWNCONV_PS_UINT16:
n := j*16
addr[n+15:n] := Float32ToUInt16(v[i+31:i])
_MM_DOWNCONV_PS_SINT16:
n := j*16
addr[n+15:n] := Float32ToSInt16(v[i+31:i])
ESAC
FI
ENDFOR
vfixupimmpd
__m128d _mm_fixupimm_pd (__m128d a, __m128d b, __m128i c, int imm8)
Synopsis
__m128d _mm_fixupimm_pd (__m128d a, __m128d b, __m128i c, int imm8)
#include "immintrin.h"
Instruction: vfixupimmpd
CPUID Flags: AVX512VL + AVX512F
Description
Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting.
Operation
enum TOKEN_TYPE {
QNAN_TOKEN := 0,
SNAN_TOKEN := 1,
ZERO_VALUE_TOKEN := 2,
ONE_VALUE_TOKEN := 3,
NEG_INF_TOKEN := 4,
POS_INF_TOKEN := 5,
NEG_VALUE_TOKEN := 6,
POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
CASE(tsrc[63:0] of TOKEN_TYPE)
QNAN_TOKEN:j := 0
SNAN_TOKEN:j := 1
ZERO_VALUE_TOKEN: j := 2
ONE_VALUE_TOKEN: j := 3
NEG_INF_TOKEN: j := 4
POS_INF_TOKEN: j := 5
NEG_VALUE_TOKEN: j := 6
POS_VALUE_TOKEN: j := 7
ESAC
token_response[3:0] := src3[3+4*j:4*j]
CASE(token_response[3:0]) of
0 : dest[63:0] := src1[63:0]
1 : dest[63:0] := tsrc[63:0]
2 : dest[63:0] := QNaN(tsrc[63:0])
3 : dest[63:0] := QNAN_Indefinite
4 : dest[63:0] := -INF
5 : dest[63:0] := +INF
6 : dest[63:0] := tsrc.sign? -INF : +INF
7 : dest[63:0] := -0
8 : dest[63:0] := +0
9 : dest[63:0] := -1
10: dest[63:0] := +1
11: dest[63:0] := 1/2
12: dest[63:0] := 90.0
13: dest[63:0] := PI/2
14: dest[63:0] := MAX_FLOAT
15: dest[63:0] := -MAX_FLOAT
ESAC
CASE(tsrc[63:0] of TOKEN_TYPE)
ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
ZERO_VALUE_TOKEN: if imm8[1] then set #IE
ONE_VALUE_TOKEN: if imm8[2] then set #ZE
ONE_VALUE_TOKEN: if imm8[3] then set #IE
SNAN_TOKEN: if imm8[4] then set #IE
NEG_INF_TOKEN: if imm8[5] then set #IE
NEG_VALUE_TOKEN: if imm8[6] then set #IE
POS_INF_TOKEN: if imm8[7] then set #IE
ESAC
RETURN dest[63:0]
}
FOR j := 0 to 1
i := j*64
dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:128] := 0
vfixupimmpd
__m128d _mm_mask_fixupimm_pd (__m128d a, __mmask8 k, __m128d b, __m128i c, int imm8)
Synopsis
__m128d _mm_mask_fixupimm_pd (__m128d a, __mmask8 k, __m128d b, __m128i c, int imm8)
#include "immintrin.h"
Instruction: vfixupimmpd
CPUID Flags: AVX512VL + AVX512F
Description
Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.
Operation
enum TOKEN_TYPE {
QNAN_TOKEN := 0,
SNAN_TOKEN := 1,
ZERO_VALUE_TOKEN := 2,
ONE_VALUE_TOKEN := 3,
NEG_INF_TOKEN := 4,
POS_INF_TOKEN := 5,
NEG_VALUE_TOKEN := 6,
POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
CASE(tsrc[63:0] of TOKEN_TYPE)
QNAN_TOKEN:j := 0
SNAN_TOKEN:j := 1
ZERO_VALUE_TOKEN: j := 2
ONE_VALUE_TOKEN: j := 3
NEG_INF_TOKEN: j := 4
POS_INF_TOKEN: j := 5
NEG_VALUE_TOKEN: j := 6
POS_VALUE_TOKEN: j := 7
ESAC
token_response[3:0] := src3[3+4*j:4*j]
CASE(token_response[3:0]) of
0 : dest[63:0] := src1[63:0]
1 : dest[63:0] := tsrc[63:0]
2 : dest[63:0] := QNaN(tsrc[63:0])
3 : dest[63:0] := QNAN_Indefinite
4 : dest[63:0] := -INF
5 : dest[63:0] := +INF
6 : dest[63:0] := tsrc.sign? -INF : +INF
7 : dest[63:0] := -0
8 : dest[63:0] := +0
9 : dest[63:0] := -1
10: dest[63:0] := +1
11: dest[63:0] := 1/2
12: dest[63:0] := 90.0
13: dest[63:0] := PI/2
14: dest[63:0] := MAX_FLOAT
15: dest[63:0] := -MAX_FLOAT
ESAC
CASE(tsrc[63:0] of TOKEN_TYPE)
ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
ZERO_VALUE_TOKEN: if imm8[1] then set #IE
ONE_VALUE_TOKEN: if imm8[2] then set #ZE
ONE_VALUE_TOKEN: if imm8[3] then set #IE
SNAN_TOKEN: if imm8[4] then set #IE
NEG_INF_TOKEN: if imm8[5] then set #IE
NEG_VALUE_TOKEN: if imm8[6] then set #IE
POS_INF_TOKEN: if imm8[7] then set #IE
ESAC
RETURN dest[63:0]
}
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vfixupimmpd
__m128d _mm_maskz_fixupimm_pd (__mmask8 k, __m128d a, __m128d b, __m128i c, int imm8)
Synopsis
__m128d _mm_maskz_fixupimm_pd (__mmask8 k, __m128d a, __m128d b, __m128i c, int imm8)
#include "immintrin.h"
Instruction: vfixupimmpd
CPUID Flags: AVX512VL + AVX512F
Description
Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.
Operation
enum TOKEN_TYPE {
QNAN_TOKEN := 0,
SNAN_TOKEN := 1,
ZERO_VALUE_TOKEN := 2,
ONE_VALUE_TOKEN := 3,
NEG_INF_TOKEN := 4,
POS_INF_TOKEN := 5,
NEG_VALUE_TOKEN := 6,
POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
CASE(tsrc[63:0] of TOKEN_TYPE)
QNAN_TOKEN:j := 0
SNAN_TOKEN:j := 1
ZERO_VALUE_TOKEN: j := 2
ONE_VALUE_TOKEN: j := 3
NEG_INF_TOKEN: j := 4
POS_INF_TOKEN: j := 5
NEG_VALUE_TOKEN: j := 6
POS_VALUE_TOKEN: j := 7
ESAC
token_response[3:0] := src3[3+4*j:4*j]
CASE(token_response[3:0]) of
0 : dest[63:0] := src1[63:0]
1 : dest[63:0] := tsrc[63:0]
2 : dest[63:0] := QNaN(tsrc[63:0])
3 : dest[63:0] := QNAN_Indefinite
4 : dest[63:0] := -INF
5 : dest[63:0] := +INF
6 : dest[63:0] := tsrc.sign? -INF : +INF
7 : dest[63:0] := -0
8 : dest[63:0] := +0
9 : dest[63:0] := -1
10: dest[63:0] := +1
11: dest[63:0] := 1/2
12: dest[63:0] := 90.0
13: dest[63:0] := PI/2
14: dest[63:0] := MAX_FLOAT
15: dest[63:0] := -MAX_FLOAT
ESAC
CASE(tsrc[63:0] of TOKEN_TYPE)
ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
ZERO_VALUE_TOKEN: if imm8[1] then set #IE
ONE_VALUE_TOKEN: if imm8[2] then set #ZE
ONE_VALUE_TOKEN: if imm8[3] then set #IE
SNAN_TOKEN: if imm8[4] then set #IE
NEG_INF_TOKEN: if imm8[5] then set #IE
NEG_VALUE_TOKEN: if imm8[6] then set #IE
POS_INF_TOKEN: if imm8[7] then set #IE
ESAC
RETURN dest[63:0]
}
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vfixupimmpd
__m256d _mm256_fixupimm_pd (__m256d a, __m256d b, __m256i c, int imm8)
Synopsis
__m256d _mm256_fixupimm_pd (__m256d a, __m256d b, __m256i c, int imm8)
#include "immintrin.h"
Instruction: vfixupimmpd
CPUID Flags: AVX512VL + AVX512F
Description
Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting.
Operation
enum TOKEN_TYPE {
QNAN_TOKEN := 0,
SNAN_TOKEN := 1,
ZERO_VALUE_TOKEN := 2,
ONE_VALUE_TOKEN := 3,
NEG_INF_TOKEN := 4,
POS_INF_TOKEN := 5,
NEG_VALUE_TOKEN := 6,
POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
CASE(tsrc[63:0] of TOKEN_TYPE)
QNAN_TOKEN:j := 0
SNAN_TOKEN:j := 1
ZERO_VALUE_TOKEN: j := 2
ONE_VALUE_TOKEN: j := 3
NEG_INF_TOKEN: j := 4
POS_INF_TOKEN: j := 5
NEG_VALUE_TOKEN: j := 6
POS_VALUE_TOKEN: j := 7
ESAC
token_response[3:0] := src3[3+4*j:4*j]
CASE(token_response[3:0]) of
0 : dest[63:0] := src1[63:0]
1 : dest[63:0] := tsrc[63:0]
2 : dest[63:0] := QNaN(tsrc[63:0])
3 : dest[63:0] := QNAN_Indefinite
4 : dest[63:0] := -INF
5 : dest[63:0] := +INF
6 : dest[63:0] := tsrc.sign? -INF : +INF
7 : dest[63:0] := -0
8 : dest[63:0] := +0
9 : dest[63:0] := -1
10: dest[63:0] := +1
11: dest[63:0] := 1/2
12: dest[63:0] := 90.0
13: dest[63:0] := PI/2
14: dest[63:0] := MAX_FLOAT
15: dest[63:0] := -MAX_FLOAT
ESAC
CASE(tsrc[63:0] of TOKEN_TYPE)
ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
ZERO_VALUE_TOKEN: if imm8[1] then set #IE
ONE_VALUE_TOKEN: if imm8[2] then set #ZE
ONE_VALUE_TOKEN: if imm8[3] then set #IE
SNAN_TOKEN: if imm8[4] then set #IE
NEG_INF_TOKEN: if imm8[5] then set #IE
NEG_VALUE_TOKEN: if imm8[6] then set #IE
POS_INF_TOKEN: if imm8[7] then set #IE
ESAC
RETURN dest[63:0]
}
FOR j := 0 to 3
i := j*64
dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:256] := 0
vfixupimmpd
__m256d _mm256_mask_fixupimm_pd (__m256d a, __mmask8 k, __m256d b, __m256i c, int imm8)
Synopsis
__m256d _mm256_mask_fixupimm_pd (__m256d a, __mmask8 k, __m256d b, __m256i c, int imm8)
#include "immintrin.h"
Instruction: vfixupimmpd
CPUID Flags: AVX512VL + AVX512F
Description
Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.
Operation
enum TOKEN_TYPE {
QNAN_TOKEN := 0,
SNAN_TOKEN := 1,
ZERO_VALUE_TOKEN := 2,
ONE_VALUE_TOKEN := 3,
NEG_INF_TOKEN := 4,
POS_INF_TOKEN := 5,
NEG_VALUE_TOKEN := 6,
POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
CASE(tsrc[63:0] of TOKEN_TYPE)
QNAN_TOKEN:j := 0
SNAN_TOKEN:j := 1
ZERO_VALUE_TOKEN: j := 2
ONE_VALUE_TOKEN: j := 3
NEG_INF_TOKEN: j := 4
POS_INF_TOKEN: j := 5
NEG_VALUE_TOKEN: j := 6
POS_VALUE_TOKEN: j := 7
ESAC
token_response[3:0] := src3[3+4*j:4*j]
CASE(token_response[3:0]) of
0 : dest[63:0] := src1[63:0]
1 : dest[63:0] := tsrc[63:0]
2 : dest[63:0] := QNaN(tsrc[63:0])
3 : dest[63:0] := QNAN_Indefinite
4 : dest[63:0] := -INF
5 : dest[63:0] := +INF
6 : dest[63:0] := tsrc.sign? -INF : +INF
7 : dest[63:0] := -0
8 : dest[63:0] := +0
9 : dest[63:0] := -1
10: dest[63:0] := +1
11: dest[63:0] := 1/2
12: dest[63:0] := 90.0
13: dest[63:0] := PI/2
14: dest[63:0] := MAX_FLOAT
15: dest[63:0] := -MAX_FLOAT
ESAC
CASE(tsrc[63:0] of TOKEN_TYPE)
ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
ZERO_VALUE_TOKEN: if imm8[1] then set #IE
ONE_VALUE_TOKEN: if imm8[2] then set #ZE
ONE_VALUE_TOKEN: if imm8[3] then set #IE
SNAN_TOKEN: if imm8[4] then set #IE
NEG_INF_TOKEN: if imm8[5] then set #IE
NEG_VALUE_TOKEN: if imm8[6] then set #IE
POS_INF_TOKEN: if imm8[7] then set #IE
ESAC
RETURN dest[63:0]
}
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vfixupimmpd
__m256d _mm256_maskz_fixupimm_pd (__mmask8 k, __m256d a, __m256d b, __m256i c, int imm8)
Synopsis
__m256d _mm256_maskz_fixupimm_pd (__mmask8 k, __m256d a, __m256d b, __m256i c, int imm8)
#include "immintrin.h"
Instruction: vfixupimmpd
CPUID Flags: AVX512VL + AVX512F
Description
Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.
Operation
enum TOKEN_TYPE {
QNAN_TOKEN := 0,
SNAN_TOKEN := 1,
ZERO_VALUE_TOKEN := 2,
ONE_VALUE_TOKEN := 3,
NEG_INF_TOKEN := 4,
POS_INF_TOKEN := 5,
NEG_VALUE_TOKEN := 6,
POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
CASE(tsrc[63:0] of TOKEN_TYPE)
QNAN_TOKEN:j := 0
SNAN_TOKEN:j := 1
ZERO_VALUE_TOKEN: j := 2
ONE_VALUE_TOKEN: j := 3
NEG_INF_TOKEN: j := 4
POS_INF_TOKEN: j := 5
NEG_VALUE_TOKEN: j := 6
POS_VALUE_TOKEN: j := 7
ESAC
token_response[3:0] := src3[3+4*j:4*j]
CASE(token_response[3:0]) of
0 : dest[63:0] := src1[63:0]
1 : dest[63:0] := tsrc[63:0]
2 : dest[63:0] := QNaN(tsrc[63:0])
3 : dest[63:0] := QNAN_Indefinite
4 : dest[63:0] := -INF
5 : dest[63:0] := +INF
6 : dest[63:0] := tsrc.sign? -INF : +INF
7 : dest[63:0] := -0
8 : dest[63:0] := +0
9 : dest[63:0] := -1
10: dest[63:0] := +1
11: dest[63:0] := 1/2
12: dest[63:0] := 90.0
13: dest[63:0] := PI/2
14: dest[63:0] := MAX_FLOAT
15: dest[63:0] := -MAX_FLOAT
ESAC
CASE(tsrc[63:0] of TOKEN_TYPE)
ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
ZERO_VALUE_TOKEN: if imm8[1] then set #IE
ONE_VALUE_TOKEN: if imm8[2] then set #ZE
ONE_VALUE_TOKEN: if imm8[3] then set #IE
SNAN_TOKEN: if imm8[4] then set #IE
NEG_INF_TOKEN: if imm8[5] then set #IE
NEG_VALUE_TOKEN: if imm8[6] then set #IE
POS_INF_TOKEN: if imm8[7] then set #IE
ESAC
RETURN dest[63:0]
}
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vfixupimmpd
__m512d _mm512_fixupimm_pd (__m512d a, __m512d b, __m512i c, int imm8)
Synopsis
__m512d _mm512_fixupimm_pd (__m512d a, __m512d b, __m512i c, int imm8)
#include "immintrin.h"
Instruction: vfixupimmpd zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting.
Operation
enum TOKEN_TYPE {
QNAN_TOKEN := 0,
SNAN_TOKEN := 1,
ZERO_VALUE_TOKEN := 2,
ONE_VALUE_TOKEN := 3,
NEG_INF_TOKEN := 4,
POS_INF_TOKEN := 5,
NEG_VALUE_TOKEN := 6,
POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
CASE(tsrc[63:0] of TOKEN_TYPE)
QNAN_TOKEN:j := 0
SNAN_TOKEN:j := 1
ZERO_VALUE_TOKEN: j := 2
ONE_VALUE_TOKEN: j := 3
NEG_INF_TOKEN: j := 4
POS_INF_TOKEN: j := 5
NEG_VALUE_TOKEN: j := 6
POS_VALUE_TOKEN: j := 7
ESAC
token_response[3:0] := src3[3+4*j:4*j]
CASE(token_response[3:0]) of
0 : dest[63:0] := src1[63:0]
1 : dest[63:0] := tsrc[63:0]
2 : dest[63:0] := QNaN(tsrc[63:0])
3 : dest[63:0] := QNAN_Indefinite
4 : dest[63:0] := -INF
5 : dest[63:0] := +INF
6 : dest[63:0] := tsrc.sign? -INF : +INF
7 : dest[63:0] := -0
8 : dest[63:0] := +0
9 : dest[63:0] := -1
10: dest[63:0] := +1
11: dest[63:0] := 1/2
12: dest[63:0] := 90.0
13: dest[63:0] := PI/2
14: dest[63:0] := MAX_FLOAT
15: dest[63:0] := -MAX_FLOAT
ESAC
CASE(tsrc[63:0] of TOKEN_TYPE)
ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
ZERO_VALUE_TOKEN: if imm8[1] then set #IE
ONE_VALUE_TOKEN: if imm8[2] then set #ZE
ONE_VALUE_TOKEN: if imm8[3] then set #IE
SNAN_TOKEN: if imm8[4] then set #IE
NEG_INF_TOKEN: if imm8[5] then set #IE
NEG_VALUE_TOKEN: if imm8[6] then set #IE
POS_INF_TOKEN: if imm8[7] then set #IE
ESAC
RETURN dest[63:0]
}
FOR j := 0 to 7
i := j*64
dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:512] := 0
vfixupimmpd
__m512d _mm512_mask_fixupimm_pd (__m512d a, __mmask8 k, __m512d b, __m512i c, int imm8)
Synopsis
__m512d _mm512_mask_fixupimm_pd (__m512d a, __mmask8 k, __m512d b, __m512i c, int imm8)
#include "immintrin.h"
Instruction: vfixupimmpd zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.
Operation
enum TOKEN_TYPE {
QNAN_TOKEN := 0,
SNAN_TOKEN := 1,
ZERO_VALUE_TOKEN := 2,
ONE_VALUE_TOKEN := 3,
NEG_INF_TOKEN := 4,
POS_INF_TOKEN := 5,
NEG_VALUE_TOKEN := 6,
POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
CASE(tsrc[63:0] of TOKEN_TYPE)
QNAN_TOKEN:j := 0
SNAN_TOKEN:j := 1
ZERO_VALUE_TOKEN: j := 2
ONE_VALUE_TOKEN: j := 3
NEG_INF_TOKEN: j := 4
POS_INF_TOKEN: j := 5
NEG_VALUE_TOKEN: j := 6
POS_VALUE_TOKEN: j := 7
ESAC
token_response[3:0] := src3[3+4*j:4*j]
CASE(token_response[3:0]) of
0 : dest[63:0] := src1[63:0]
1 : dest[63:0] := tsrc[63:0]
2 : dest[63:0] := QNaN(tsrc[63:0])
3 : dest[63:0] := QNAN_Indefinite
4 : dest[63:0] := -INF
5 : dest[63:0] := +INF
6 : dest[63:0] := tsrc.sign? -INF : +INF
7 : dest[63:0] := -0
8 : dest[63:0] := +0
9 : dest[63:0] := -1
10: dest[63:0] := +1
11: dest[63:0] := 1/2
12: dest[63:0] := 90.0
13: dest[63:0] := PI/2
14: dest[63:0] := MAX_FLOAT
15: dest[63:0] := -MAX_FLOAT
ESAC
CASE(tsrc[63:0] of TOKEN_TYPE)
ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
ZERO_VALUE_TOKEN: if imm8[1] then set #IE
ONE_VALUE_TOKEN: if imm8[2] then set #ZE
ONE_VALUE_TOKEN: if imm8[3] then set #IE
SNAN_TOKEN: if imm8[4] then set #IE
NEG_INF_TOKEN: if imm8[5] then set #IE
NEG_VALUE_TOKEN: if imm8[6] then set #IE
POS_INF_TOKEN: if imm8[7] then set #IE
ESAC
RETURN dest[63:0]
}
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vfixupimmpd
__m512d _mm512_maskz_fixupimm_pd (__mmask8 k, __m512d a, __m512d b, __m512i c, int imm8)
Synopsis
__m512d _mm512_maskz_fixupimm_pd (__mmask8 k, __m512d a, __m512d b, __m512i c, int imm8)
#include "immintrin.h"
Instruction: vfixupimmpd zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Fix up packed double-precision (64-bit) floating-point elements in a and b using packed 64-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.
Operation
enum TOKEN_TYPE {
QNAN_TOKEN := 0,
SNAN_TOKEN := 1,
ZERO_VALUE_TOKEN := 2,
ONE_VALUE_TOKEN := 3,
NEG_INF_TOKEN := 4,
POS_INF_TOKEN := 5,
NEG_VALUE_TOKEN := 6,
POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
CASE(tsrc[63:0] of TOKEN_TYPE)
QNAN_TOKEN:j := 0
SNAN_TOKEN:j := 1
ZERO_VALUE_TOKEN: j := 2
ONE_VALUE_TOKEN: j := 3
NEG_INF_TOKEN: j := 4
POS_INF_TOKEN: j := 5
NEG_VALUE_TOKEN: j := 6
POS_VALUE_TOKEN: j := 7
ESAC
token_response[3:0] := src3[3+4*j:4*j]
CASE(token_response[3:0]) of
0 : dest[63:0] := src1[63:0]
1 : dest[63:0] := tsrc[63:0]
2 : dest[63:0] := QNaN(tsrc[63:0])
3 : dest[63:0] := QNAN_Indefinite
4 : dest[63:0] := -INF
5 : dest[63:0] := +INF
6 : dest[63:0] := tsrc.sign? -INF : +INF
7 : dest[63:0] := -0
8 : dest[63:0] := +0
9 : dest[63:0] := -1
10: dest[63:0] := +1
11: dest[63:0] := 1/2
12: dest[63:0] := 90.0
13: dest[63:0] := PI/2
14: dest[63:0] := MAX_FLOAT
15: dest[63:0] := -MAX_FLOAT
ESAC
CASE(tsrc[63:0] of TOKEN_TYPE)
ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
ZERO_VALUE_TOKEN: if imm8[1] then set #IE
ONE_VALUE_TOKEN: if imm8[2] then set #ZE
ONE_VALUE_TOKEN: if imm8[3] then set #IE
SNAN_TOKEN: if imm8[4] then set #IE
NEG_INF_TOKEN: if imm8[5] then set #IE
NEG_VALUE_TOKEN: if imm8[6] then set #IE
POS_INF_TOKEN: if imm8[7] then set #IE
ESAC
RETURN dest[63:0]
}
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vfixupimmps
__m128 _mm_fixupimm_ps (__m128 a, __m128 b, __m128i c, int imm8)
Synopsis
__m128 _mm_fixupimm_ps (__m128 a, __m128 b, __m128i c, int imm8)
#include "immintrin.h"
Instruction: vfixupimmps
CPUID Flags: AVX512VL + AVX512F
Description
Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting.
Operation
enum TOKEN_TYPE {
QNAN_TOKEN := 0,
SNAN_TOKEN := 1,
ZERO_VALUE_TOKEN := 2,
ONE_VALUE_TOKEN := 3,
NEG_INF_TOKEN := 4,
POS_INF_TOKEN := 5,
NEG_VALUE_TOKEN := 6,
POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
CASE(tsrc[31:0] of TOKEN_TYPE)
QNAN_TOKEN:j := 0
SNAN_TOKEN:j := 1
ZERO_VALUE_TOKEN: j := 2
ONE_VALUE_TOKEN: j := 3
NEG_INF_TOKEN: j := 4
POS_INF_TOKEN: j := 5
NEG_VALUE_TOKEN: j := 6
POS_VALUE_TOKEN: j := 7
ESAC
token_response[3:0] := src3[3+4*j:4*j]
CASE(token_response[3:0]) of
0 : dest[31:0] := src1[31:0]
1 : dest[31:0] := tsrc[31:0]
2 : dest[31:0] := QNaN(tsrc[31:0])
3 : dest[31:0] := QNAN_Indefinite
4 : dest[31:0] := -INF
5 : dest[31:0] := +INF
6 : dest[31:0] := tsrc.sign? -INF : +INF
7 : dest[31:0] := -0
8 : dest[31:0] := +0
9 : dest[31:0] := -1
10: dest[31:0] := +1
11: dest[31:0] := 1/2
12: dest[31:0] := 90.0
13: dest[31:0] := PI/2
14: dest[31:0] := MAX_FLOAT
15: dest[31:0] := -MAX_FLOAT
ESAC
CASE(tsrc[31:0] of TOKEN_TYPE)
ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
ZERO_VALUE_TOKEN: if imm8[1] then set #IE
ONE_VALUE_TOKEN: if imm8[2] then set #ZE
ONE_VALUE_TOKEN: if imm8[3] then set #IE
SNAN_TOKEN: if imm8[4] then set #IE
NEG_INF_TOKEN: if imm8[5] then set #IE
NEG_VALUE_TOKEN: if imm8[6] then set #IE
POS_INF_TOKEN: if imm8[7] then set #IE
ESAC
RETURN dest[31:0]
}
FOR j := 0 to 3
i := j*32
dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:128] := 0
vfixupimmps
__m128 _mm_mask_fixupimm_ps (__m128 a, __mmask8 k, __m128 b, __m128i c, int imm8)
Synopsis
__m128 _mm_mask_fixupimm_ps (__m128 a, __mmask8 k, __m128 b, __m128i c, int imm8)
#include "immintrin.h"
Instruction: vfixupimmps
CPUID Flags: AVX512VL + AVX512F
Description
Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.
Operation
enum TOKEN_TYPE {
QNAN_TOKEN := 0,
SNAN_TOKEN := 1,
ZERO_VALUE_TOKEN := 2,
ONE_VALUE_TOKEN := 3,
NEG_INF_TOKEN := 4,
POS_INF_TOKEN := 5,
NEG_VALUE_TOKEN := 6,
POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
CASE(tsrc[31:0] of TOKEN_TYPE)
QNAN_TOKEN:j := 0
SNAN_TOKEN:j := 1
ZERO_VALUE_TOKEN: j := 2
ONE_VALUE_TOKEN: j := 3
NEG_INF_TOKEN: j := 4
POS_INF_TOKEN: j := 5
NEG_VALUE_TOKEN: j := 6
POS_VALUE_TOKEN: j := 7
ESAC
token_response[3:0] := src3[3+4*j:4*j]
CASE(token_response[3:0]) of
0 : dest[31:0] := src1[31:0]
1 : dest[31:0] := tsrc[31:0]
2 : dest[31:0] := QNaN(tsrc[31:0])
3 : dest[31:0] := QNAN_Indefinite
4 : dest[31:0] := -INF
5 : dest[31:0] := +INF
6 : dest[31:0] := tsrc.sign? -INF : +INF
7 : dest[31:0] := -0
8 : dest[31:0] := +0
9 : dest[31:0] := -1
10: dest[31:0] := +1
11: dest[31:0] := 1/2
12: dest[31:0] := 90.0
13: dest[31:0] := PI/2
14: dest[31:0] := MAX_FLOAT
15: dest[31:0] := -MAX_FLOAT
ESAC
CASE(tsrc[31:0] of TOKEN_TYPE)
ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
ZERO_VALUE_TOKEN: if imm8[1] then set #IE
ONE_VALUE_TOKEN: if imm8[2] then set #ZE
ONE_VALUE_TOKEN: if imm8[3] then set #IE
SNAN_TOKEN: if imm8[4] then set #IE
NEG_INF_TOKEN: if imm8[5] then set #IE
NEG_VALUE_TOKEN: if imm8[6] then set #IE
POS_INF_TOKEN: if imm8[7] then set #IE
ESAC
RETURN dest[31:0]
}
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
ELSE
dst[i+31:i] := a[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vfixupimmps
__m128 _mm_maskz_fixupimm_ps (__mmask8 k, __m128 a, __m128 b, __m128i c, int imm8)
Synopsis
__m128 _mm_maskz_fixupimm_ps (__mmask8 k, __m128 a, __m128 b, __m128i c, int imm8)
#include "immintrin.h"
Instruction: vfixupimmps
CPUID Flags: AVX512VL + AVX512F
Description
Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.
Operation
enum TOKEN_TYPE {
QNAN_TOKEN := 0,
SNAN_TOKEN := 1,
ZERO_VALUE_TOKEN := 2,
ONE_VALUE_TOKEN := 3,
NEG_INF_TOKEN := 4,
POS_INF_TOKEN := 5,
NEG_VALUE_TOKEN := 6,
POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
CASE(tsrc[31:0] of TOKEN_TYPE)
QNAN_TOKEN:j := 0
SNAN_TOKEN:j := 1
ZERO_VALUE_TOKEN: j := 2
ONE_VALUE_TOKEN: j := 3
NEG_INF_TOKEN: j := 4
POS_INF_TOKEN: j := 5
NEG_VALUE_TOKEN: j := 6
POS_VALUE_TOKEN: j := 7
ESAC
token_response[3:0] := src3[3+4*j:4*j]
CASE(token_response[3:0]) of
0 : dest[31:0] := src1[31:0]
1 : dest[31:0] := tsrc[31:0]
2 : dest[31:0] := QNaN(tsrc[31:0])
3 : dest[31:0] := QNAN_Indefinite
4 : dest[31:0] := -INF
5 : dest[31:0] := +INF
6 : dest[31:0] := tsrc.sign? -INF : +INF
7 : dest[31:0] := -0
8 : dest[31:0] := +0
9 : dest[31:0] := -1
10: dest[31:0] := +1
11: dest[31:0] := 1/2
12: dest[31:0] := 90.0
13: dest[31:0] := PI/2
14: dest[31:0] := MAX_FLOAT
15: dest[31:0] := -MAX_FLOAT
ESAC
CASE(tsrc[31:0] of TOKEN_TYPE)
ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
ZERO_VALUE_TOKEN: if imm8[1] then set #IE
ONE_VALUE_TOKEN: if imm8[2] then set #ZE
ONE_VALUE_TOKEN: if imm8[3] then set #IE
SNAN_TOKEN: if imm8[4] then set #IE
NEG_INF_TOKEN: if imm8[5] then set #IE
NEG_VALUE_TOKEN: if imm8[6] then set #IE
POS_INF_TOKEN: if imm8[7] then set #IE
ESAC
RETURN dest[31:0]
}
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vfixupimmps
__m256 _mm256_fixupimm_ps (__m256 a, __m256 b, __m256i c, int imm8)
Synopsis
__m256 _mm256_fixupimm_ps (__m256 a, __m256 b, __m256i c, int imm8)
#include "immintrin.h"
Instruction: vfixupimmps
CPUID Flags: AVX512VL + AVX512F
Description
Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting.
Operation
enum TOKEN_TYPE {
QNAN_TOKEN := 0,
SNAN_TOKEN := 1,
ZERO_VALUE_TOKEN := 2,
ONE_VALUE_TOKEN := 3,
NEG_INF_TOKEN := 4,
POS_INF_TOKEN := 5,
NEG_VALUE_TOKEN := 6,
POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
CASE(tsrc[31:0] of TOKEN_TYPE)
QNAN_TOKEN:j := 0
SNAN_TOKEN:j := 1
ZERO_VALUE_TOKEN: j := 2
ONE_VALUE_TOKEN: j := 3
NEG_INF_TOKEN: j := 4
POS_INF_TOKEN: j := 5
NEG_VALUE_TOKEN: j := 6
POS_VALUE_TOKEN: j := 7
ESAC
token_response[3:0] := src3[3+4*j:4*j]
CASE(token_response[3:0]) of
0 : dest[31:0] := src1[31:0]
1 : dest[31:0] := tsrc[31:0]
2 : dest[31:0] := QNaN(tsrc[31:0])
3 : dest[31:0] := QNAN_Indefinite
4 : dest[31:0] := -INF
5 : dest[31:0] := +INF
6 : dest[31:0] := tsrc.sign? -INF : +INF
7 : dest[31:0] := -0
8 : dest[31:0] := +0
9 : dest[31:0] := -1
10: dest[31:0] := +1
11: dest[31:0] := 1/2
12: dest[31:0] := 90.0
13: dest[31:0] := PI/2
14: dest[31:0] := MAX_FLOAT
15: dest[31:0] := -MAX_FLOAT
ESAC
CASE(tsrc[31:0] of TOKEN_TYPE)
ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
ZERO_VALUE_TOKEN: if imm8[1] then set #IE
ONE_VALUE_TOKEN: if imm8[2] then set #ZE
ONE_VALUE_TOKEN: if imm8[3] then set #IE
SNAN_TOKEN: if imm8[4] then set #IE
NEG_INF_TOKEN: if imm8[5] then set #IE
NEG_VALUE_TOKEN: if imm8[6] then set #IE
POS_INF_TOKEN: if imm8[7] then set #IE
ESAC
RETURN dest[31:0]
}
FOR j := 0 to 7
i := j*32
dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:256] := 0
vfixupimmps
__m256 _mm256_mask_fixupimm_ps (__m256 a, __mmask8 k, __m256 b, __m256i c, int imm8)
Synopsis
__m256 _mm256_mask_fixupimm_ps (__m256 a, __mmask8 k, __m256 b, __m256i c, int imm8)
#include "immintrin.h"
Instruction: vfixupimmps
CPUID Flags: AVX512VL + AVX512F
Description
Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.
Operation
enum TOKEN_TYPE {
QNAN_TOKEN := 0,
SNAN_TOKEN := 1,
ZERO_VALUE_TOKEN := 2,
ONE_VALUE_TOKEN := 3,
NEG_INF_TOKEN := 4,
POS_INF_TOKEN := 5,
NEG_VALUE_TOKEN := 6,
POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
CASE(tsrc[31:0] of TOKEN_TYPE)
QNAN_TOKEN:j := 0
SNAN_TOKEN:j := 1
ZERO_VALUE_TOKEN: j := 2
ONE_VALUE_TOKEN: j := 3
NEG_INF_TOKEN: j := 4
POS_INF_TOKEN: j := 5
NEG_VALUE_TOKEN: j := 6
POS_VALUE_TOKEN: j := 7
ESAC
token_response[3:0] := src3[3+4*j:4*j]
CASE(token_response[3:0]) of
0 : dest[31:0] := src1[31:0]
1 : dest[31:0] := tsrc[31:0]
2 : dest[31:0] := QNaN(tsrc[31:0])
3 : dest[31:0] := QNAN_Indefinite
4 : dest[31:0] := -INF
5 : dest[31:0] := +INF
6 : dest[31:0] := tsrc.sign? -INF : +INF
7 : dest[31:0] := -0
8 : dest[31:0] := +0
9 : dest[31:0] := -1
10: dest[31:0] := +1
11: dest[31:0] := 1/2
12: dest[31:0] := 90.0
13: dest[31:0] := PI/2
14: dest[31:0] := MAX_FLOAT
15: dest[31:0] := -MAX_FLOAT
ESAC
CASE(tsrc[31:0] of TOKEN_TYPE)
ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
ZERO_VALUE_TOKEN: if imm8[1] then set #IE
ONE_VALUE_TOKEN: if imm8[2] then set #ZE
ONE_VALUE_TOKEN: if imm8[3] then set #IE
SNAN_TOKEN: if imm8[4] then set #IE
NEG_INF_TOKEN: if imm8[5] then set #IE
NEG_VALUE_TOKEN: if imm8[6] then set #IE
POS_INF_TOKEN: if imm8[7] then set #IE
ESAC
RETURN dest[31:0]
}
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
ELSE
dst[i+31:i] := a[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vfixupimmps
__m256 _mm256_maskz_fixupimm_ps (__mmask8 k, __m256 a, __m256 b, __m256i c, int imm8)
Synopsis
__m256 _mm256_maskz_fixupimm_ps (__mmask8 k, __m256 a, __m256 b, __m256i c, int imm8)
#include "immintrin.h"
Instruction: vfixupimmps
CPUID Flags: AVX512VL + AVX512F
Description
Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.
Operation
enum TOKEN_TYPE {
QNAN_TOKEN := 0,
SNAN_TOKEN := 1,
ZERO_VALUE_TOKEN := 2,
ONE_VALUE_TOKEN := 3,
NEG_INF_TOKEN := 4,
POS_INF_TOKEN := 5,
NEG_VALUE_TOKEN := 6,
POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
CASE(tsrc[31:0] of TOKEN_TYPE)
QNAN_TOKEN:j := 0
SNAN_TOKEN:j := 1
ZERO_VALUE_TOKEN: j := 2
ONE_VALUE_TOKEN: j := 3
NEG_INF_TOKEN: j := 4
POS_INF_TOKEN: j := 5
NEG_VALUE_TOKEN: j := 6
POS_VALUE_TOKEN: j := 7
ESAC
token_response[3:0] := src3[3+4*j:4*j]
CASE(token_response[3:0]) of
0 : dest[31:0] := src1[31:0]
1 : dest[31:0] := tsrc[31:0]
2 : dest[31:0] := QNaN(tsrc[31:0])
3 : dest[31:0] := QNAN_Indefinite
4 : dest[31:0] := -INF
5 : dest[31:0] := +INF
6 : dest[31:0] := tsrc.sign? -INF : +INF
7 : dest[31:0] := -0
8 : dest[31:0] := +0
9 : dest[31:0] := -1
10: dest[31:0] := +1
11: dest[31:0] := 1/2
12: dest[31:0] := 90.0
13: dest[31:0] := PI/2
14: dest[31:0] := MAX_FLOAT
15: dest[31:0] := -MAX_FLOAT
ESAC
CASE(tsrc[31:0] of TOKEN_TYPE)
ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
ZERO_VALUE_TOKEN: if imm8[1] then set #IE
ONE_VALUE_TOKEN: if imm8[2] then set #ZE
ONE_VALUE_TOKEN: if imm8[3] then set #IE
SNAN_TOKEN: if imm8[4] then set #IE
NEG_INF_TOKEN: if imm8[5] then set #IE
NEG_VALUE_TOKEN: if imm8[6] then set #IE
POS_INF_TOKEN: if imm8[7] then set #IE
ESAC
RETURN dest[31:0]
}
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vfixupimmps
__m512 _mm512_fixupimm_ps (__m512 a, __m512 b, __m512i c, int imm8)
Synopsis
__m512 _mm512_fixupimm_ps (__m512 a, __m512 b, __m512i c, int imm8)
#include "immintrin.h"
Instruction: vfixupimmps zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst. imm8 is used to set the required flags reporting.
Operation
enum TOKEN_TYPE {
QNAN_TOKEN := 0,
SNAN_TOKEN := 1,
ZERO_VALUE_TOKEN := 2,
ONE_VALUE_TOKEN := 3,
NEG_INF_TOKEN := 4,
POS_INF_TOKEN := 5,
NEG_VALUE_TOKEN := 6,
POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
CASE(tsrc[31:0] of TOKEN_TYPE)
QNAN_TOKEN:j := 0
SNAN_TOKEN:j := 1
ZERO_VALUE_TOKEN: j := 2
ONE_VALUE_TOKEN: j := 3
NEG_INF_TOKEN: j := 4
POS_INF_TOKEN: j := 5
NEG_VALUE_TOKEN: j := 6
POS_VALUE_TOKEN: j := 7
ESAC
token_response[3:0] := src3[3+4*j:4*j]
CASE(token_response[3:0]) of
0 : dest[31:0] := src1[31:0]
1 : dest[31:0] := tsrc[31:0]
2 : dest[31:0] := QNaN(tsrc[31:0])
3 : dest[31:0] := QNAN_Indefinite
4 : dest[31:0] := -INF
5 : dest[31:0] := +INF
6 : dest[31:0] := tsrc.sign? -INF : +INF
7 : dest[31:0] := -0
8 : dest[31:0] := +0
9 : dest[31:0] := -1
10: dest[31:0] := +1
11: dest[31:0] := 1/2
12: dest[31:0] := 90.0
13: dest[31:0] := PI/2
14: dest[31:0] := MAX_FLOAT
15: dest[31:0] := -MAX_FLOAT
ESAC
CASE(tsrc[31:0] of TOKEN_TYPE)
ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
ZERO_VALUE_TOKEN: if imm8[1] then set #IE
ONE_VALUE_TOKEN: if imm8[2] then set #ZE
ONE_VALUE_TOKEN: if imm8[3] then set #IE
SNAN_TOKEN: if imm8[4] then set #IE
NEG_INF_TOKEN: if imm8[5] then set #IE
NEG_VALUE_TOKEN: if imm8[6] then set #IE
POS_INF_TOKEN: if imm8[7] then set #IE
ESAC
RETURN dest[31:0]
}
FOR j := 0 to 15
i := j*32
dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:512] := 0
vfixupimmps
__m512 _mm512_mask_fixupimm_ps (__m512 a, __mmask16 k, __m512 b, __m512i c, int imm8)
Synopsis
__m512 _mm512_mask_fixupimm_ps (__m512 a, __mmask16 k, __m512 b, __m512i c, int imm8)
#include "immintrin.h"
Instruction: vfixupimmps zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.
Operation
enum TOKEN_TYPE {
QNAN_TOKEN := 0,
SNAN_TOKEN := 1,
ZERO_VALUE_TOKEN := 2,
ONE_VALUE_TOKEN := 3,
NEG_INF_TOKEN := 4,
POS_INF_TOKEN := 5,
NEG_VALUE_TOKEN := 6,
POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
CASE(tsrc[31:0] of TOKEN_TYPE)
QNAN_TOKEN:j := 0
SNAN_TOKEN:j := 1
ZERO_VALUE_TOKEN: j := 2
ONE_VALUE_TOKEN: j := 3
NEG_INF_TOKEN: j := 4
POS_INF_TOKEN: j := 5
NEG_VALUE_TOKEN: j := 6
POS_VALUE_TOKEN: j := 7
ESAC
token_response[3:0] := src3[3+4*j:4*j]
CASE(token_response[3:0]) of
0 : dest[31:0] := src1[31:0]
1 : dest[31:0] := tsrc[31:0]
2 : dest[31:0] := QNaN(tsrc[31:0])
3 : dest[31:0] := QNAN_Indefinite
4 : dest[31:0] := -INF
5 : dest[31:0] := +INF
6 : dest[31:0] := tsrc.sign? -INF : +INF
7 : dest[31:0] := -0
8 : dest[31:0] := +0
9 : dest[31:0] := -1
10: dest[31:0] := +1
11: dest[31:0] := 1/2
12: dest[31:0] := 90.0
13: dest[31:0] := PI/2
14: dest[31:0] := MAX_FLOAT
15: dest[31:0] := -MAX_FLOAT
ESAC
CASE(tsrc[31:0] of TOKEN_TYPE)
ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
ZERO_VALUE_TOKEN: if imm8[1] then set #IE
ONE_VALUE_TOKEN: if imm8[2] then set #ZE
ONE_VALUE_TOKEN: if imm8[3] then set #IE
SNAN_TOKEN: if imm8[4] then set #IE
NEG_INF_TOKEN: if imm8[5] then set #IE
NEG_VALUE_TOKEN: if imm8[6] then set #IE
POS_INF_TOKEN: if imm8[7] then set #IE
ESAC
RETURN dest[31:0]
}
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
ELSE
dst[i+31:i] := a[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vfixupimmps
__m512 _mm512_maskz_fixupimm_ps (__mmask16 k, __m512 a, __m512 b, __m512i c, int imm8)
Synopsis
__m512 _mm512_maskz_fixupimm_ps (__mmask16 k, __m512 a, __m512 b, __m512i c, int imm8)
#include "immintrin.h"
Instruction: vfixupimmps zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Fix up packed single-precision (32-bit) floating-point elements in a and b using packed 32-bit integers in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). imm8 is used to set the required flags reporting.
Operation
enum TOKEN_TYPE {
QNAN_TOKEN := 0,
SNAN_TOKEN := 1,
ZERO_VALUE_TOKEN := 2,
ONE_VALUE_TOKEN := 3,
NEG_INF_TOKEN := 4,
POS_INF_TOKEN := 5,
NEG_VALUE_TOKEN := 6,
POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
CASE(tsrc[31:0] of TOKEN_TYPE)
QNAN_TOKEN:j := 0
SNAN_TOKEN:j := 1
ZERO_VALUE_TOKEN: j := 2
ONE_VALUE_TOKEN: j := 3
NEG_INF_TOKEN: j := 4
POS_INF_TOKEN: j := 5
NEG_VALUE_TOKEN: j := 6
POS_VALUE_TOKEN: j := 7
ESAC
token_response[3:0] := src3[3+4*j:4*j]
CASE(token_response[3:0]) of
0 : dest[31:0] := src1[31:0]
1 : dest[31:0] := tsrc[31:0]
2 : dest[31:0] := QNaN(tsrc[31:0])
3 : dest[31:0] := QNAN_Indefinite
4 : dest[31:0] := -INF
5 : dest[31:0] := +INF
6 : dest[31:0] := tsrc.sign? -INF : +INF
7 : dest[31:0] := -0
8 : dest[31:0] := +0
9 : dest[31:0] := -1
10: dest[31:0] := +1
11: dest[31:0] := 1/2
12: dest[31:0] := 90.0
13: dest[31:0] := PI/2
14: dest[31:0] := MAX_FLOAT
15: dest[31:0] := -MAX_FLOAT
ESAC
CASE(tsrc[31:0] of TOKEN_TYPE)
ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
ZERO_VALUE_TOKEN: if imm8[1] then set #IE
ONE_VALUE_TOKEN: if imm8[2] then set #ZE
ONE_VALUE_TOKEN: if imm8[3] then set #IE
SNAN_TOKEN: if imm8[4] then set #IE
NEG_INF_TOKEN: if imm8[5] then set #IE
NEG_VALUE_TOKEN: if imm8[6] then set #IE
POS_INF_TOKEN: if imm8[7] then set #IE
ESAC
RETURN dest[31:0]
}
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vfixupimmpd
__m512d _mm512_fixupimm_round_pd (__m512d a, __m512d b, __m512i c, int imm8, int rounding)
Synopsis
__m512d _mm512_fixupimm_round_pd (__m512d a, __m512d b, __m512i c, int imm8, int rounding)
#include "immintrin.h"
Instruction: vfixupimmpd zmm {k}, zmm, zmm, imm {er}
CPUID Flags: AVX512F
Description
Fix up packed double-precision (64-bit) floating-point elements in
a and
b using packed 64-bit integers in
c, and store the results in
dst.
imm8 is used to set the required flags reporting.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
enum TOKEN_TYPE {
QNAN_TOKEN := 0,
SNAN_TOKEN := 1,
ZERO_VALUE_TOKEN := 2,
ONE_VALUE_TOKEN := 3,
NEG_INF_TOKEN := 4,
POS_INF_TOKEN := 5,
NEG_VALUE_TOKEN := 6,
POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
CASE(tsrc[63:0] of TOKEN_TYPE)
QNAN_TOKEN:j := 0
SNAN_TOKEN:j := 1
ZERO_VALUE_TOKEN: j := 2
ONE_VALUE_TOKEN: j := 3
NEG_INF_TOKEN: j := 4
POS_INF_TOKEN: j := 5
NEG_VALUE_TOKEN: j := 6
POS_VALUE_TOKEN: j := 7
ESAC
token_response[3:0] := src3[3+4*j:4*j]
CASE(token_response[3:0]) of
0 : dest[63:0] := src1[63:0]
1 : dest[63:0] := tsrc[63:0]
2 : dest[63:0] := QNaN(tsrc[63:0])
3 : dest[63:0] := QNAN_Indefinite
4 : dest[63:0] := -INF
5 : dest[63:0] := +INF
6 : dest[63:0] := tsrc.sign? -INF : +INF
7 : dest[63:0] := -0
8 : dest[63:0] := +0
9 : dest[63:0] := -1
10: dest[63:0] := +1
11: dest[63:0] := 1/2
12: dest[63:0] := 90.0
13: dest[63:0] := PI/2
14: dest[63:0] := MAX_FLOAT
15: dest[63:0] := -MAX_FLOAT
ESAC
CASE(tsrc[63:0] of TOKEN_TYPE)
ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
ZERO_VALUE_TOKEN: if imm8[1] then set #IE
ONE_VALUE_TOKEN: if imm8[2] then set #ZE
ONE_VALUE_TOKEN: if imm8[3] then set #IE
SNAN_TOKEN: if imm8[4] then set #IE
NEG_INF_TOKEN: if imm8[5] then set #IE
NEG_VALUE_TOKEN: if imm8[6] then set #IE
POS_INF_TOKEN: if imm8[7] then set #IE
ESAC
RETURN dest[63:0]
}
FOR j := 0 to 7
i := j*64
dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:512] := 0
vfixupimmpd
__m512d _mm512_mask_fixupimm_round_pd (__m512d a, __mmask8 k, __m512d b, __m512i c, int imm8, int rounding)
Synopsis
__m512d _mm512_mask_fixupimm_round_pd (__m512d a, __mmask8 k, __m512d b, __m512i c, int imm8, int rounding)
#include "immintrin.h"
Instruction: vfixupimmpd zmm {k}, zmm, zmm, imm {er}
CPUID Flags: AVX512F
Description
Fix up packed double-precision (64-bit) floating-point elements in
a and
b using packed 64-bit integers in
c, and store the results in
dst using writemask
k (elements are copied from
a when the corresponding mask bit is not set).
imm8 is used to set the required flags reporting.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
enum TOKEN_TYPE {
QNAN_TOKEN := 0,
SNAN_TOKEN := 1,
ZERO_VALUE_TOKEN := 2,
ONE_VALUE_TOKEN := 3,
NEG_INF_TOKEN := 4,
POS_INF_TOKEN := 5,
NEG_VALUE_TOKEN := 6,
POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
CASE(tsrc[63:0] of TOKEN_TYPE)
QNAN_TOKEN:j := 0
SNAN_TOKEN:j := 1
ZERO_VALUE_TOKEN: j := 2
ONE_VALUE_TOKEN: j := 3
NEG_INF_TOKEN: j := 4
POS_INF_TOKEN: j := 5
NEG_VALUE_TOKEN: j := 6
POS_VALUE_TOKEN: j := 7
ESAC
token_response[3:0] := src3[3+4*j:4*j]
CASE(token_response[3:0]) of
0 : dest[63:0] := src1[63:0]
1 : dest[63:0] := tsrc[63:0]
2 : dest[63:0] := QNaN(tsrc[63:0])
3 : dest[63:0] := QNAN_Indefinite
4 : dest[63:0] := -INF
5 : dest[63:0] := +INF
6 : dest[63:0] := tsrc.sign? -INF : +INF
7 : dest[63:0] := -0
8 : dest[63:0] := +0
9 : dest[63:0] := -1
10: dest[63:0] := +1
11: dest[63:0] := 1/2
12: dest[63:0] := 90.0
13: dest[63:0] := PI/2
14: dest[63:0] := MAX_FLOAT
15: dest[63:0] := -MAX_FLOAT
ESAC
CASE(tsrc[63:0] of TOKEN_TYPE)
ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
ZERO_VALUE_TOKEN: if imm8[1] then set #IE
ONE_VALUE_TOKEN: if imm8[2] then set #ZE
ONE_VALUE_TOKEN: if imm8[3] then set #IE
SNAN_TOKEN: if imm8[4] then set #IE
NEG_INF_TOKEN: if imm8[5] then set #IE
NEG_VALUE_TOKEN: if imm8[6] then set #IE
POS_INF_TOKEN: if imm8[7] then set #IE
ESAC
RETURN dest[63:0]
}
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vfixupimmpd
__m512d _mm512_maskz_fixupimm_round_pd (__mmask8 k, __m512d a, __m512d b, __m512i c, int imm8, int rounding)
Synopsis
__m512d _mm512_maskz_fixupimm_round_pd (__mmask8 k, __m512d a, __m512d b, __m512i c, int imm8, int rounding)
#include "immintrin.h"
Instruction: vfixupimmpd zmm {k}, zmm, zmm, imm {er}
CPUID Flags: AVX512F
Description
Fix up packed double-precision (64-bit) floating-point elements in
a and
b using packed 64-bit integers in
c, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set).
imm8 is used to set the required flags reporting.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
enum TOKEN_TYPE {
QNAN_TOKEN := 0,
SNAN_TOKEN := 1,
ZERO_VALUE_TOKEN := 2,
ONE_VALUE_TOKEN := 3,
NEG_INF_TOKEN := 4,
POS_INF_TOKEN := 5,
NEG_VALUE_TOKEN := 6,
POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
CASE(tsrc[63:0] of TOKEN_TYPE)
QNAN_TOKEN:j := 0
SNAN_TOKEN:j := 1
ZERO_VALUE_TOKEN: j := 2
ONE_VALUE_TOKEN: j := 3
NEG_INF_TOKEN: j := 4
POS_INF_TOKEN: j := 5
NEG_VALUE_TOKEN: j := 6
POS_VALUE_TOKEN: j := 7
ESAC
token_response[3:0] := src3[3+4*j:4*j]
CASE(token_response[3:0]) of
0 : dest[63:0] := src1[63:0]
1 : dest[63:0] := tsrc[63:0]
2 : dest[63:0] := QNaN(tsrc[63:0])
3 : dest[63:0] := QNAN_Indefinite
4 : dest[63:0] := -INF
5 : dest[63:0] := +INF
6 : dest[63:0] := tsrc.sign? -INF : +INF
7 : dest[63:0] := -0
8 : dest[63:0] := +0
9 : dest[63:0] := -1
10: dest[63:0] := +1
11: dest[63:0] := 1/2
12: dest[63:0] := 90.0
13: dest[63:0] := PI/2
14: dest[63:0] := MAX_FLOAT
15: dest[63:0] := -MAX_FLOAT
ESAC
CASE(tsrc[63:0] of TOKEN_TYPE)
ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
ZERO_VALUE_TOKEN: if imm8[1] then set #IE
ONE_VALUE_TOKEN: if imm8[2] then set #ZE
ONE_VALUE_TOKEN: if imm8[3] then set #IE
SNAN_TOKEN: if imm8[4] then set #IE
NEG_INF_TOKEN: if imm8[5] then set #IE
NEG_VALUE_TOKEN: if imm8[6] then set #IE
POS_INF_TOKEN: if imm8[7] then set #IE
ESAC
RETURN dest[63:0]
}
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := FIXUPIMMPD(a[i+63:i], b[i+63:i], c[i+63:i], imm8[7:0])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vfixupimmps
__m512 _mm512_fixupimm_round_ps (__m512 a, __m512 b, __m512i c, int imm8, int rounding)
Synopsis
__m512 _mm512_fixupimm_round_ps (__m512 a, __m512 b, __m512i c, int imm8, int rounding)
#include "immintrin.h"
Instruction: vfixupimmps zmm {k}, zmm, zmm, imm {er}
CPUID Flags: AVX512F
Description
Fix up packed single-precision (32-bit) floating-point elements in
a and
b using packed 32-bit integers in
c, and store the results in
dst.
imm8 is used to set the required flags reporting.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
enum TOKEN_TYPE {
QNAN_TOKEN := 0,
SNAN_TOKEN := 1,
ZERO_VALUE_TOKEN := 2,
ONE_VALUE_TOKEN := 3,
NEG_INF_TOKEN := 4,
POS_INF_TOKEN := 5,
NEG_VALUE_TOKEN := 6,
POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
CASE(tsrc[31:0] of TOKEN_TYPE)
QNAN_TOKEN:j := 0
SNAN_TOKEN:j := 1
ZERO_VALUE_TOKEN: j := 2
ONE_VALUE_TOKEN: j := 3
NEG_INF_TOKEN: j := 4
POS_INF_TOKEN: j := 5
NEG_VALUE_TOKEN: j := 6
POS_VALUE_TOKEN: j := 7
ESAC
token_response[3:0] := src3[3+4*j:4*j]
CASE(token_response[3:0]) of
0 : dest[31:0] := src1[31:0]
1 : dest[31:0] := tsrc[31:0]
2 : dest[31:0] := QNaN(tsrc[31:0])
3 : dest[31:0] := QNAN_Indefinite
4 : dest[31:0] := -INF
5 : dest[31:0] := +INF
6 : dest[31:0] := tsrc.sign? -INF : +INF
7 : dest[31:0] := -0
8 : dest[31:0] := +0
9 : dest[31:0] := -1
10: dest[31:0] := +1
11: dest[31:0] := 1/2
12: dest[31:0] := 90.0
13: dest[31:0] := PI/2
14: dest[31:0] := MAX_FLOAT
15: dest[31:0] := -MAX_FLOAT
ESAC
CASE(tsrc[31:0] of TOKEN_TYPE)
ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
ZERO_VALUE_TOKEN: if imm8[1] then set #IE
ONE_VALUE_TOKEN: if imm8[2] then set #ZE
ONE_VALUE_TOKEN: if imm8[3] then set #IE
SNAN_TOKEN: if imm8[4] then set #IE
NEG_INF_TOKEN: if imm8[5] then set #IE
NEG_VALUE_TOKEN: if imm8[6] then set #IE
POS_INF_TOKEN: if imm8[7] then set #IE
ESAC
RETURN dest[31:0]
}
FOR j := 0 to 15
i := j*32
dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:512] := 0
vfixupimmps
__m512 _mm512_mask_fixupimm_round_ps (__m512 a, __mmask16 k, __m512 b, __m512i c, int imm8, int rounding)
Synopsis
__m512 _mm512_mask_fixupimm_round_ps (__m512 a, __mmask16 k, __m512 b, __m512i c, int imm8, int rounding)
#include "immintrin.h"
Instruction: vfixupimmps zmm {k}, zmm, zmm, imm {er}
CPUID Flags: AVX512F
Description
Fix up packed single-precision (32-bit) floating-point elements in
a and
b using packed 32-bit integers in
c, and store the results in
dst using writemask
k (elements are copied from
a when the corresponding mask bit is not set).
imm8 is used to set the required flags reporting.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
enum TOKEN_TYPE {
QNAN_TOKEN := 0,
SNAN_TOKEN := 1,
ZERO_VALUE_TOKEN := 2,
ONE_VALUE_TOKEN := 3,
NEG_INF_TOKEN := 4,
POS_INF_TOKEN := 5,
NEG_VALUE_TOKEN := 6,
POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
CASE(tsrc[31:0] of TOKEN_TYPE)
QNAN_TOKEN:j := 0
SNAN_TOKEN:j := 1
ZERO_VALUE_TOKEN: j := 2
ONE_VALUE_TOKEN: j := 3
NEG_INF_TOKEN: j := 4
POS_INF_TOKEN: j := 5
NEG_VALUE_TOKEN: j := 6
POS_VALUE_TOKEN: j := 7
ESAC
token_response[3:0] := src3[3+4*j:4*j]
CASE(token_response[3:0]) of
0 : dest[31:0] := src1[31:0]
1 : dest[31:0] := tsrc[31:0]
2 : dest[31:0] := QNaN(tsrc[31:0])
3 : dest[31:0] := QNAN_Indefinite
4 : dest[31:0] := -INF
5 : dest[31:0] := +INF
6 : dest[31:0] := tsrc.sign? -INF : +INF
7 : dest[31:0] := -0
8 : dest[31:0] := +0
9 : dest[31:0] := -1
10: dest[31:0] := +1
11: dest[31:0] := 1/2
12: dest[31:0] := 90.0
13: dest[31:0] := PI/2
14: dest[31:0] := MAX_FLOAT
15: dest[31:0] := -MAX_FLOAT
ESAC
CASE(tsrc[31:0] of TOKEN_TYPE)
ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
ZERO_VALUE_TOKEN: if imm8[1] then set #IE
ONE_VALUE_TOKEN: if imm8[2] then set #ZE
ONE_VALUE_TOKEN: if imm8[3] then set #IE
SNAN_TOKEN: if imm8[4] then set #IE
NEG_INF_TOKEN: if imm8[5] then set #IE
NEG_VALUE_TOKEN: if imm8[6] then set #IE
POS_INF_TOKEN: if imm8[7] then set #IE
ESAC
RETURN dest[31:0]
}
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
ELSE
dst[i+31:i] := a[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vfixupimmps
__m512 _mm512_maskz_fixupimm_round_ps (__mmask16 k, __m512 a, __m512 b, __m512i c, int imm8, int rounding)
Synopsis
__m512 _mm512_maskz_fixupimm_round_ps (__mmask16 k, __m512 a, __m512 b, __m512i c, int imm8, int rounding)
#include "immintrin.h"
Instruction: vfixupimmps zmm {k}, zmm, zmm, imm {er}
CPUID Flags: AVX512F
Description
Fix up packed single-precision (32-bit) floating-point elements in
a and
b using packed 32-bit integers in
c, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set).
imm8 is used to set the required flags reporting.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
enum TOKEN_TYPE {
QNAN_TOKEN := 0,
SNAN_TOKEN := 1,
ZERO_VALUE_TOKEN := 2,
ONE_VALUE_TOKEN := 3,
NEG_INF_TOKEN := 4,
POS_INF_TOKEN := 5,
NEG_VALUE_TOKEN := 6,
POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
CASE(tsrc[31:0] of TOKEN_TYPE)
QNAN_TOKEN:j := 0
SNAN_TOKEN:j := 1
ZERO_VALUE_TOKEN: j := 2
ONE_VALUE_TOKEN: j := 3
NEG_INF_TOKEN: j := 4
POS_INF_TOKEN: j := 5
NEG_VALUE_TOKEN: j := 6
POS_VALUE_TOKEN: j := 7
ESAC
token_response[3:0] := src3[3+4*j:4*j]
CASE(token_response[3:0]) of
0 : dest[31:0] := src1[31:0]
1 : dest[31:0] := tsrc[31:0]
2 : dest[31:0] := QNaN(tsrc[31:0])
3 : dest[31:0] := QNAN_Indefinite
4 : dest[31:0] := -INF
5 : dest[31:0] := +INF
6 : dest[31:0] := tsrc.sign? -INF : +INF
7 : dest[31:0] := -0
8 : dest[31:0] := +0
9 : dest[31:0] := -1
10: dest[31:0] := +1
11: dest[31:0] := 1/2
12: dest[31:0] := 90.0
13: dest[31:0] := PI/2
14: dest[31:0] := MAX_FLOAT
15: dest[31:0] := -MAX_FLOAT
ESAC
CASE(tsrc[31:0] of TOKEN_TYPE)
ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
ZERO_VALUE_TOKEN: if imm8[1] then set #IE
ONE_VALUE_TOKEN: if imm8[2] then set #ZE
ONE_VALUE_TOKEN: if imm8[3] then set #IE
SNAN_TOKEN: if imm8[4] then set #IE
NEG_INF_TOKEN: if imm8[5] then set #IE
NEG_VALUE_TOKEN: if imm8[6] then set #IE
POS_INF_TOKEN: if imm8[7] then set #IE
ESAC
RETURN dest[31:0]
}
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := FIXUPIMMPD(a[i+31:i], b[i+31:i], c[i+31:i], imm8[7:0])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vfixupimmsd
__m128d _mm_fixupimm_round_sd (__m128d a, __m128d b, __m128i c, int imm8, int rounding)
Synopsis
__m128d _mm_fixupimm_round_sd (__m128d a, __m128d b, __m128i c, int imm8, int rounding)
#include "immintrin.h"
Instruction: vfixupimmsd xmm {k}, xmm, xmm, imm {er}
CPUID Flags: AVX512F
Description
Fix up the lower double-precision (64-bit) floating-point elements in
a and
b using the lower 64-bit integer in
c, store the result in the lower element of
dst, and copy the upper element from
a to the upper element of
dst.
imm8 is used to set the required flags reporting.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
enum TOKEN_TYPE {
QNAN_TOKEN := 0,
SNAN_TOKEN := 1,
ZERO_VALUE_TOKEN := 2,
ONE_VALUE_TOKEN := 3,
NEG_INF_TOKEN := 4,
POS_INF_TOKEN := 5,
NEG_VALUE_TOKEN := 6,
POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
CASE(tsrc[63:0] of TOKEN_TYPE)
QNAN_TOKEN:j := 0
SNAN_TOKEN:j := 1
ZERO_VALUE_TOKEN: j := 2
ONE_VALUE_TOKEN: j := 3
NEG_INF_TOKEN: j := 4
POS_INF_TOKEN: j := 5
NEG_VALUE_TOKEN: j := 6
POS_VALUE_TOKEN: j := 7
ESAC
token_response[3:0] := src3[3+4*j:4*j]
CASE(token_response[3:0]) of
0 : dest[63:0] := src1[63:0]
1 : dest[63:0] := tsrc[63:0]
2 : dest[63:0] := QNaN(tsrc[63:0])
3 : dest[63:0] := QNAN_Indefinite
4 : dest[63:0] := -INF
5 : dest[63:0] := +INF
6 : dest[63:0] := tsrc.sign? -INF : +INF
7 : dest[63:0] := -0
8 : dest[63:0] := +0
9 : dest[63:0] := -1
10: dest[63:0] := +1
11: dest[63:0] := 1/2
12: dest[63:0] := 90.0
13: dest[63:0] := PI/2
14: dest[63:0] := MAX_FLOAT
15: dest[63:0] := -MAX_FLOAT
ESAC
CASE(tsrc[63:0] of TOKEN_TYPE)
ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
ZERO_VALUE_TOKEN: if imm8[1] then set #IE
ONE_VALUE_TOKEN: if imm8[2] then set #ZE
ONE_VALUE_TOKEN: if imm8[3] then set #IE
SNAN_TOKEN: if imm8[4] then set #IE
NEG_INF_TOKEN: if imm8[5] then set #IE
NEG_VALUE_TOKEN: if imm8[6] then set #IE
POS_INF_TOKEN: if imm8[7] then set #IE
ESAC
RETURN dest[63:0]
}
dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0])
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vfixupimmsd
__m128d _mm_mask_fixupimm_round_sd (__m128d a, __mmask8 k, __m128d b, __m128i c, int imm8, int rounding)
Synopsis
__m128d _mm_mask_fixupimm_round_sd (__m128d a, __mmask8 k, __m128d b, __m128i c, int imm8, int rounding)
#include "immintrin.h"
Instruction: vfixupimmsd xmm {k}, xmm, xmm, imm {er}
CPUID Flags: AVX512F
Description
Fix up the lower double-precision (64-bit) floating-point elements in
a and
b using the lower 64-bit integer in
c, store the result in the lower element of
dst using writemask
k (the element is copied from
a when mask bit 0 is not set), and copy the upper element from
a to the upper element of
dst.
imm8 is used to set the required flags reporting.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
enum TOKEN_TYPE {
QNAN_TOKEN := 0,
SNAN_TOKEN := 1,
ZERO_VALUE_TOKEN := 2,
ONE_VALUE_TOKEN := 3,
NEG_INF_TOKEN := 4,
POS_INF_TOKEN := 5,
NEG_VALUE_TOKEN := 6,
POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
CASE(tsrc[63:0] of TOKEN_TYPE)
QNAN_TOKEN:j := 0
SNAN_TOKEN:j := 1
ZERO_VALUE_TOKEN: j := 2
ONE_VALUE_TOKEN: j := 3
NEG_INF_TOKEN: j := 4
POS_INF_TOKEN: j := 5
NEG_VALUE_TOKEN: j := 6
POS_VALUE_TOKEN: j := 7
ESAC
token_response[3:0] := src3[3+4*j:4*j]
CASE(token_response[3:0]) of
0 : dest[63:0] := src1[63:0]
1 : dest[63:0] := tsrc[63:0]
2 : dest[63:0] := QNaN(tsrc[63:0])
3 : dest[63:0] := QNAN_Indefinite
4 : dest[63:0] := -INF
5 : dest[63:0] := +INF
6 : dest[63:0] := tsrc.sign? -INF : +INF
7 : dest[63:0] := -0
8 : dest[63:0] := +0
9 : dest[63:0] := -1
10: dest[63:0] := +1
11: dest[63:0] := 1/2
12: dest[63:0] := 90.0
13: dest[63:0] := PI/2
14: dest[63:0] := MAX_FLOAT
15: dest[63:0] := -MAX_FLOAT
ESAC
CASE(tsrc[63:0] of TOKEN_TYPE)
ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
ZERO_VALUE_TOKEN: if imm8[1] then set #IE
ONE_VALUE_TOKEN: if imm8[2] then set #ZE
ONE_VALUE_TOKEN: if imm8[3] then set #IE
SNAN_TOKEN: if imm8[4] then set #IE
NEG_INF_TOKEN: if imm8[5] then set #IE
NEG_VALUE_TOKEN: if imm8[6] then set #IE
POS_INF_TOKEN: if imm8[7] then set #IE
ESAC
RETURN dest[63:0]
}
IF k[0]
dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0])
ELSE
dst[63:0] := a[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vfixupimmsd
__m128d _mm_maskz_fixupimm_round_sd (__mmask8 k, __m128d a, __m128d b, __m128i c, int imm8, int rounding)
Synopsis
__m128d _mm_maskz_fixupimm_round_sd (__mmask8 k, __m128d a, __m128d b, __m128i c, int imm8, int rounding)
#include "immintrin.h"
Instruction: vfixupimmsd xmm {k}, xmm, xmm, imm {er}
CPUID Flags: AVX512F
Description
Fix up the lower double-precision (64-bit) floating-point elements in
a and
b using the lower 64-bit integer in
c, store the result in the lower element of
dst using zeromask
k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from
a to the upper element of
dst.
imm8 is used to set the required flags reporting.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
enum TOKEN_TYPE {
QNAN_TOKEN := 0,
SNAN_TOKEN := 1,
ZERO_VALUE_TOKEN := 2,
ONE_VALUE_TOKEN := 3,
NEG_INF_TOKEN := 4,
POS_INF_TOKEN := 5,
NEG_VALUE_TOKEN := 6,
POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
CASE(tsrc[63:0] of TOKEN_TYPE)
QNAN_TOKEN:j := 0
SNAN_TOKEN:j := 1
ZERO_VALUE_TOKEN: j := 2
ONE_VALUE_TOKEN: j := 3
NEG_INF_TOKEN: j := 4
POS_INF_TOKEN: j := 5
NEG_VALUE_TOKEN: j := 6
POS_VALUE_TOKEN: j := 7
ESAC
token_response[3:0] := src3[3+4*j:4*j]
CASE(token_response[3:0]) of
0 : dest[63:0] := src1[63:0]
1 : dest[63:0] := tsrc[63:0]
2 : dest[63:0] := QNaN(tsrc[63:0])
3 : dest[63:0] := QNAN_Indefinite
4 : dest[63:0] := -INF
5 : dest[63:0] := +INF
6 : dest[63:0] := tsrc.sign? -INF : +INF
7 : dest[63:0] := -0
8 : dest[63:0] := +0
9 : dest[63:0] := -1
10: dest[63:0] := +1
11: dest[63:0] := 1/2
12: dest[63:0] := 90.0
13: dest[63:0] := PI/2
14: dest[63:0] := MAX_FLOAT
15: dest[63:0] := -MAX_FLOAT
ESAC
CASE(tsrc[63:0] of TOKEN_TYPE)
ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
ZERO_VALUE_TOKEN: if imm8[1] then set #IE
ONE_VALUE_TOKEN: if imm8[2] then set #ZE
ONE_VALUE_TOKEN: if imm8[3] then set #IE
SNAN_TOKEN: if imm8[4] then set #IE
NEG_INF_TOKEN: if imm8[5] then set #IE
NEG_VALUE_TOKEN: if imm8[6] then set #IE
POS_INF_TOKEN: if imm8[7] then set #IE
ESAC
RETURN dest[63:0]
}
IF k[0]
dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0])
ELSE
dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vfixupimmss
__m128 _mm_fixupimm_round_ss (__m128 a, __m128 b, __m128i c, int imm8, int rounding)
Synopsis
__m128 _mm_fixupimm_round_ss (__m128 a, __m128 b, __m128i c, int imm8, int rounding)
#include "immintrin.h"
Instruction: vfixupimmss xmm {k}, xmm, xmm, imm {er}
CPUID Flags: AVX512F
Description
Fix up the lower single-precision (32-bit) floating-point elements in
a and
b using the lower 32-bit integer in
c, store the result in the lower element of
dst, and copy the upper 3 packed elements from
a to the upper elements of
dst.
imm8 is used to set the required flags reporting.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
enum TOKEN_TYPE {
QNAN_TOKEN := 0,
SNAN_TOKEN := 1,
ZERO_VALUE_TOKEN := 2,
ONE_VALUE_TOKEN := 3,
NEG_INF_TOKEN := 4,
POS_INF_TOKEN := 5,
NEG_VALUE_TOKEN := 6,
POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
CASE(tsrc[31:0] of TOKEN_TYPE)
QNAN_TOKEN:j := 0
SNAN_TOKEN:j := 1
ZERO_VALUE_TOKEN: j := 2
ONE_VALUE_TOKEN: j := 3
NEG_INF_TOKEN: j := 4
POS_INF_TOKEN: j := 5
NEG_VALUE_TOKEN: j := 6
POS_VALUE_TOKEN: j := 7
ESAC
token_response[3:0] := src3[3+4*j:4*j]
CASE(token_response[3:0]) of
0 : dest[31:0] := src1[31:0]
1 : dest[31:0] := tsrc[31:0]
2 : dest[31:0] := QNaN(tsrc[31:0])
3 : dest[31:0] := QNAN_Indefinite
4 : dest[31:0] := -INF
5 : dest[31:0] := +INF
6 : dest[31:0] := tsrc.sign? -INF : +INF
7 : dest[31:0] := -0
8 : dest[31:0] := +0
9 : dest[31:0] := -1
10: dest[31:0] := +1
11: dest[31:0] := 1/2
12: dest[31:0] := 90.0
13: dest[31:0] := PI/2
14: dest[31:0] := MAX_FLOAT
15: dest[31:0] := -MAX_FLOAT
ESAC
CASE(tsrc[31:0] of TOKEN_TYPE)
ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
ZERO_VALUE_TOKEN: if imm8[1] then set #IE
ONE_VALUE_TOKEN: if imm8[2] then set #ZE
ONE_VALUE_TOKEN: if imm8[3] then set #IE
SNAN_TOKEN: if imm8[4] then set #IE
NEG_INF_TOKEN: if imm8[5] then set #IE
NEG_VALUE_TOKEN: if imm8[6] then set #IE
POS_INF_TOKEN: if imm8[7] then set #IE
ESAC
RETURN dest[31:0]
}
dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vfixupimmss
__m128 _mm_mask_fixupimm_round_ss (__m128 a, __mmask8 k, __m128 b, __m128i c, int imm8, int rounding)
Synopsis
__m128 _mm_mask_fixupimm_round_ss (__m128 a, __mmask8 k, __m128 b, __m128i c, int imm8, int rounding)
#include "immintrin.h"
Instruction: vfixupimmss xmm {k}, xmm, xmm, imm {er}
CPUID Flags: AVX512F
Description
Fix up the lower single-precision (32-bit) floating-point elements in
a and
b using the lower 32-bit integer in
c, store the result in the lower element of
dst using writemask
k (the element is copied from
a when mask bit 0 is not set), and copy the upper 3 packed elements from
a to the upper elements of
dst.
imm8 is used to set the required flags reporting.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
enum TOKEN_TYPE {
QNAN_TOKEN := 0,
SNAN_TOKEN := 1,
ZERO_VALUE_TOKEN := 2,
ONE_VALUE_TOKEN := 3,
NEG_INF_TOKEN := 4,
POS_INF_TOKEN := 5,
NEG_VALUE_TOKEN := 6,
POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
CASE(tsrc[31:0] of TOKEN_TYPE)
QNAN_TOKEN:j := 0
SNAN_TOKEN:j := 1
ZERO_VALUE_TOKEN: j := 2
ONE_VALUE_TOKEN: j := 3
NEG_INF_TOKEN: j := 4
POS_INF_TOKEN: j := 5
NEG_VALUE_TOKEN: j := 6
POS_VALUE_TOKEN: j := 7
ESAC
token_response[3:0] := src3[3+4*j:4*j]
CASE(token_response[3:0]) of
0 : dest[31:0] := src1[31:0]
1 : dest[31:0] := tsrc[31:0]
2 : dest[31:0] := QNaN(tsrc[31:0])
3 : dest[31:0] := QNAN_Indefinite
4 : dest[31:0] := -INF
5 : dest[31:0] := +INF
6 : dest[31:0] := tsrc.sign? -INF : +INF
7 : dest[31:0] := -0
8 : dest[31:0] := +0
9 : dest[31:0] := -1
10: dest[31:0] := +1
11: dest[31:0] := 1/2
12: dest[31:0] := 90.0
13: dest[31:0] := PI/2
14: dest[31:0] := MAX_FLOAT
15: dest[31:0] := -MAX_FLOAT
ESAC
CASE(tsrc[31:0] of TOKEN_TYPE)
ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
ZERO_VALUE_TOKEN: if imm8[1] then set #IE
ONE_VALUE_TOKEN: if imm8[2] then set #ZE
ONE_VALUE_TOKEN: if imm8[3] then set #IE
SNAN_TOKEN: if imm8[4] then set #IE
NEG_INF_TOKEN: if imm8[5] then set #IE
NEG_VALUE_TOKEN: if imm8[6] then set #IE
POS_INF_TOKEN: if imm8[7] then set #IE
ESAC
RETURN dest[31:0]
}
IF k[0]
dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0])
ELSE
dst[31:0] := a[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vfixupimmss
__m128 _mm_maskz_fixupimm_round_ss (__mmask8 k, __m128 a, __m128 b, __m128i c, int imm8, int rounding)
Synopsis
__m128 _mm_maskz_fixupimm_round_ss (__mmask8 k, __m128 a, __m128 b, __m128i c, int imm8, int rounding)
#include "immintrin.h"
Instruction: vfixupimmss xmm {k}, xmm, xmm, imm {er}
CPUID Flags: AVX512F
Description
Fix up the lower single-precision (32-bit) floating-point elements in
a and
b using the lower 32-bit integer in
c, store the result in the lower element of
dst using zeromask
k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from
a to the upper elements of
dst.
imm8 is used to set the required flags reporting.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
enum TOKEN_TYPE {
QNAN_TOKEN := 0,
SNAN_TOKEN := 1,
ZERO_VALUE_TOKEN := 2,
ONE_VALUE_TOKEN := 3,
NEG_INF_TOKEN := 4,
POS_INF_TOKEN := 5,
NEG_VALUE_TOKEN := 6,
POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
CASE(tsrc[31:0] of TOKEN_TYPE)
QNAN_TOKEN:j := 0
SNAN_TOKEN:j := 1
ZERO_VALUE_TOKEN: j := 2
ONE_VALUE_TOKEN: j := 3
NEG_INF_TOKEN: j := 4
POS_INF_TOKEN: j := 5
NEG_VALUE_TOKEN: j := 6
POS_VALUE_TOKEN: j := 7
ESAC
token_response[3:0] := src3[3+4*j:4*j]
CASE(token_response[3:0]) of
0 : dest[31:0] := src1[31:0]
1 : dest[31:0] := tsrc[31:0]
2 : dest[31:0] := QNaN(tsrc[31:0])
3 : dest[31:0] := QNAN_Indefinite
4 : dest[31:0] := -INF
5 : dest[31:0] := +INF
6 : dest[31:0] := tsrc.sign? -INF : +INF
7 : dest[31:0] := -0
8 : dest[31:0] := +0
9 : dest[31:0] := -1
10: dest[31:0] := +1
11: dest[31:0] := 1/2
12: dest[31:0] := 90.0
13: dest[31:0] := PI/2
14: dest[31:0] := MAX_FLOAT
15: dest[31:0] := -MAX_FLOAT
ESAC
CASE(tsrc[31:0] of TOKEN_TYPE)
ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
ZERO_VALUE_TOKEN: if imm8[1] then set #IE
ONE_VALUE_TOKEN: if imm8[2] then set #ZE
ONE_VALUE_TOKEN: if imm8[3] then set #IE
SNAN_TOKEN: if imm8[4] then set #IE
NEG_INF_TOKEN: if imm8[5] then set #IE
NEG_VALUE_TOKEN: if imm8[6] then set #IE
POS_INF_TOKEN: if imm8[7] then set #IE
ESAC
RETURN dest[31:0]
}
IF k[0]
dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0])
ELSE
dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vfixupimmsd
__m128d _mm_fixupimm_sd (__m128d a, __m128d b, __m128i c, int imm8)
Synopsis
__m128d _mm_fixupimm_sd (__m128d a, __m128d b, __m128i c, int imm8)
#include "immintrin.h"
Instruction: vfixupimmsd xmm {k}, xmm, xmm, imm
CPUID Flags: AVX512F
Description
Fix up the lower double-precision (64-bit) floating-point elements in a and b using the lower 64-bit integer in c, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. imm8 is used to set the required flags reporting.
Operation
enum TOKEN_TYPE {
QNAN_TOKEN := 0,
SNAN_TOKEN := 1,
ZERO_VALUE_TOKEN := 2,
ONE_VALUE_TOKEN := 3,
NEG_INF_TOKEN := 4,
POS_INF_TOKEN := 5,
NEG_VALUE_TOKEN := 6,
POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
CASE(tsrc[63:0] of TOKEN_TYPE)
QNAN_TOKEN:j := 0
SNAN_TOKEN:j := 1
ZERO_VALUE_TOKEN: j := 2
ONE_VALUE_TOKEN: j := 3
NEG_INF_TOKEN: j := 4
POS_INF_TOKEN: j := 5
NEG_VALUE_TOKEN: j := 6
POS_VALUE_TOKEN: j := 7
ESAC
token_response[3:0] := src3[3+4*j:4*j]
CASE(token_response[3:0]) of
0 : dest[63:0] := src1[63:0]
1 : dest[63:0] := tsrc[63:0]
2 : dest[63:0] := QNaN(tsrc[63:0])
3 : dest[63:0] := QNAN_Indefinite
4 : dest[63:0] := -INF
5 : dest[63:0] := +INF
6 : dest[63:0] := tsrc.sign? -INF : +INF
7 : dest[63:0] := -0
8 : dest[63:0] := +0
9 : dest[63:0] := -1
10: dest[63:0] := +1
11: dest[63:0] := 1/2
12: dest[63:0] := 90.0
13: dest[63:0] := PI/2
14: dest[63:0] := MAX_FLOAT
15: dest[63:0] := -MAX_FLOAT
ESAC
CASE(tsrc[63:0] of TOKEN_TYPE)
ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
ZERO_VALUE_TOKEN: if imm8[1] then set #IE
ONE_VALUE_TOKEN: if imm8[2] then set #ZE
ONE_VALUE_TOKEN: if imm8[3] then set #IE
SNAN_TOKEN: if imm8[4] then set #IE
NEG_INF_TOKEN: if imm8[5] then set #IE
NEG_VALUE_TOKEN: if imm8[6] then set #IE
POS_INF_TOKEN: if imm8[7] then set #IE
ESAC
RETURN dest[63:0]
}
dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0])
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vfixupimmsd
__m128d _mm_mask_fixupimm_sd (__m128d a, __mmask8 k, __m128d b, __m128i c, int imm8)
Synopsis
__m128d _mm_mask_fixupimm_sd (__m128d a, __mmask8 k, __m128d b, __m128i c, int imm8)
#include "immintrin.h"
Instruction: vfixupimmsd xmm {k}, xmm, xmm, imm
CPUID Flags: AVX512F
Description
Fix up the lower double-precision (64-bit) floating-point elements in a and b using the lower 64-bit integer in c, store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. imm8 is used to set the required flags reporting.
Operation
enum TOKEN_TYPE {
QNAN_TOKEN := 0,
SNAN_TOKEN := 1,
ZERO_VALUE_TOKEN := 2,
ONE_VALUE_TOKEN := 3,
NEG_INF_TOKEN := 4,
POS_INF_TOKEN := 5,
NEG_VALUE_TOKEN := 6,
POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
CASE(tsrc[63:0] of TOKEN_TYPE)
QNAN_TOKEN:j := 0
SNAN_TOKEN:j := 1
ZERO_VALUE_TOKEN: j := 2
ONE_VALUE_TOKEN: j := 3
NEG_INF_TOKEN: j := 4
POS_INF_TOKEN: j := 5
NEG_VALUE_TOKEN: j := 6
POS_VALUE_TOKEN: j := 7
ESAC
token_response[3:0] := src3[3+4*j:4*j]
CASE(token_response[3:0]) of
0 : dest[63:0] := src1[63:0]
1 : dest[63:0] := tsrc[63:0]
2 : dest[63:0] := QNaN(tsrc[63:0])
3 : dest[63:0] := QNAN_Indefinite
4 : dest[63:0] := -INF
5 : dest[63:0] := +INF
6 : dest[63:0] := tsrc.sign? -INF : +INF
7 : dest[63:0] := -0
8 : dest[63:0] := +0
9 : dest[63:0] := -1
10: dest[63:0] := +1
11: dest[63:0] := 1/2
12: dest[63:0] := 90.0
13: dest[63:0] := PI/2
14: dest[63:0] := MAX_FLOAT
15: dest[63:0] := -MAX_FLOAT
ESAC
CASE(tsrc[63:0] of TOKEN_TYPE)
ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
ZERO_VALUE_TOKEN: if imm8[1] then set #IE
ONE_VALUE_TOKEN: if imm8[2] then set #ZE
ONE_VALUE_TOKEN: if imm8[3] then set #IE
SNAN_TOKEN: if imm8[4] then set #IE
NEG_INF_TOKEN: if imm8[5] then set #IE
NEG_VALUE_TOKEN: if imm8[6] then set #IE
POS_INF_TOKEN: if imm8[7] then set #IE
ESAC
RETURN dest[63:0]
}
IF k[0]
dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0])
ELSE
dst[63:0] := a[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vfixupimmsd
__m128d _mm_maskz_fixupimm_sd (__mmask8 k, __m128d a, __m128d b, __m128i c, int imm8)
Synopsis
__m128d _mm_maskz_fixupimm_sd (__mmask8 k, __m128d a, __m128d b, __m128i c, int imm8)
#include "immintrin.h"
Instruction: vfixupimmsd xmm {k}, xmm, xmm, imm
CPUID Flags: AVX512F
Description
Fix up the lower double-precision (64-bit) floating-point elements in a and b using the lower 64-bit integer in c, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. imm8 is used to set the required flags reporting.
Operation
enum TOKEN_TYPE {
QNAN_TOKEN := 0,
SNAN_TOKEN := 1,
ZERO_VALUE_TOKEN := 2,
ONE_VALUE_TOKEN := 3,
NEG_INF_TOKEN := 4,
POS_INF_TOKEN := 5,
NEG_VALUE_TOKEN := 6,
POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[63:0], src2[63:0], src3[63:0], imm8[7:0]){
tsrc[63:0] := ((src2[62:52] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[63:0]
CASE(tsrc[63:0] of TOKEN_TYPE)
QNAN_TOKEN:j := 0
SNAN_TOKEN:j := 1
ZERO_VALUE_TOKEN: j := 2
ONE_VALUE_TOKEN: j := 3
NEG_INF_TOKEN: j := 4
POS_INF_TOKEN: j := 5
NEG_VALUE_TOKEN: j := 6
POS_VALUE_TOKEN: j := 7
ESAC
token_response[3:0] := src3[3+4*j:4*j]
CASE(token_response[3:0]) of
0 : dest[63:0] := src1[63:0]
1 : dest[63:0] := tsrc[63:0]
2 : dest[63:0] := QNaN(tsrc[63:0])
3 : dest[63:0] := QNAN_Indefinite
4 : dest[63:0] := -INF
5 : dest[63:0] := +INF
6 : dest[63:0] := tsrc.sign? -INF : +INF
7 : dest[63:0] := -0
8 : dest[63:0] := +0
9 : dest[63:0] := -1
10: dest[63:0] := +1
11: dest[63:0] := 1/2
12: dest[63:0] := 90.0
13: dest[63:0] := PI/2
14: dest[63:0] := MAX_FLOAT
15: dest[63:0] := -MAX_FLOAT
ESAC
CASE(tsrc[63:0] of TOKEN_TYPE)
ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
ZERO_VALUE_TOKEN: if imm8[1] then set #IE
ONE_VALUE_TOKEN: if imm8[2] then set #ZE
ONE_VALUE_TOKEN: if imm8[3] then set #IE
SNAN_TOKEN: if imm8[4] then set #IE
NEG_INF_TOKEN: if imm8[5] then set #IE
NEG_VALUE_TOKEN: if imm8[6] then set #IE
POS_INF_TOKEN: if imm8[7] then set #IE
ESAC
RETURN dest[63:0]
}
IF k[0]
dst[63:0] := FIXUPIMMPD(a[63:0], b[63:0], c[63:0], imm8[7:0])
ELSE
dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vfixupimmss
__m128 _mm_fixupimm_ss (__m128 a, __m128 b, __m128i c, int imm8)
Synopsis
__m128 _mm_fixupimm_ss (__m128 a, __m128 b, __m128i c, int imm8)
#include "immintrin.h"
Instruction: vfixupimmss xmm {k}, xmm, xmm, imm
CPUID Flags: AVX512F
Description
Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. imm8 is used to set the required flags reporting.
Operation
enum TOKEN_TYPE {
QNAN_TOKEN := 0,
SNAN_TOKEN := 1,
ZERO_VALUE_TOKEN := 2,
ONE_VALUE_TOKEN := 3,
NEG_INF_TOKEN := 4,
POS_INF_TOKEN := 5,
NEG_VALUE_TOKEN := 6,
POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
CASE(tsrc[31:0] of TOKEN_TYPE)
QNAN_TOKEN:j := 0
SNAN_TOKEN:j := 1
ZERO_VALUE_TOKEN: j := 2
ONE_VALUE_TOKEN: j := 3
NEG_INF_TOKEN: j := 4
POS_INF_TOKEN: j := 5
NEG_VALUE_TOKEN: j := 6
POS_VALUE_TOKEN: j := 7
ESAC
token_response[3:0] := src3[3+4*j:4*j]
CASE(token_response[3:0]) of
0 : dest[31:0] := src1[31:0]
1 : dest[31:0] := tsrc[31:0]
2 : dest[31:0] := QNaN(tsrc[31:0])
3 : dest[31:0] := QNAN_Indefinite
4 : dest[31:0] := -INF
5 : dest[31:0] := +INF
6 : dest[31:0] := tsrc.sign? -INF : +INF
7 : dest[31:0] := -0
8 : dest[31:0] := +0
9 : dest[31:0] := -1
10: dest[31:0] := +1
11: dest[31:0] := 1/2
12: dest[31:0] := 90.0
13: dest[31:0] := PI/2
14: dest[31:0] := MAX_FLOAT
15: dest[31:0] := -MAX_FLOAT
ESAC
CASE(tsrc[31:0] of TOKEN_TYPE)
ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
ZERO_VALUE_TOKEN: if imm8[1] then set #IE
ONE_VALUE_TOKEN: if imm8[2] then set #ZE
ONE_VALUE_TOKEN: if imm8[3] then set #IE
SNAN_TOKEN: if imm8[4] then set #IE
NEG_INF_TOKEN: if imm8[5] then set #IE
NEG_VALUE_TOKEN: if imm8[6] then set #IE
POS_INF_TOKEN: if imm8[7] then set #IE
ESAC
RETURN dest[31:0]
}
dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vfixupimmss
__m128 _mm_mask_fixupimm_ss (__m128 a, __mmask8 k, __m128 b, __m128i c, int imm8)
Synopsis
__m128 _mm_mask_fixupimm_ss (__m128 a, __mmask8 k, __m128 b, __m128i c, int imm8)
#include "immintrin.h"
Instruction: vfixupimmss xmm {k}, xmm, xmm, imm
CPUID Flags: AVX512F
Description
Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. imm8 is used to set the required flags reporting.
Operation
enum TOKEN_TYPE {
QNAN_TOKEN := 0,
SNAN_TOKEN := 1,
ZERO_VALUE_TOKEN := 2,
ONE_VALUE_TOKEN := 3,
NEG_INF_TOKEN := 4,
POS_INF_TOKEN := 5,
NEG_VALUE_TOKEN := 6,
POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
CASE(tsrc[31:0] of TOKEN_TYPE)
QNAN_TOKEN:j := 0
SNAN_TOKEN:j := 1
ZERO_VALUE_TOKEN: j := 2
ONE_VALUE_TOKEN: j := 3
NEG_INF_TOKEN: j := 4
POS_INF_TOKEN: j := 5
NEG_VALUE_TOKEN: j := 6
POS_VALUE_TOKEN: j := 7
ESAC
token_response[3:0] := src3[3+4*j:4*j]
CASE(token_response[3:0]) of
0 : dest[31:0] := src1[31:0]
1 : dest[31:0] := tsrc[31:0]
2 : dest[31:0] := QNaN(tsrc[31:0])
3 : dest[31:0] := QNAN_Indefinite
4 : dest[31:0] := -INF
5 : dest[31:0] := +INF
6 : dest[31:0] := tsrc.sign? -INF : +INF
7 : dest[31:0] := -0
8 : dest[31:0] := +0
9 : dest[31:0] := -1
10: dest[31:0] := +1
11: dest[31:0] := 1/2
12: dest[31:0] := 90.0
13: dest[31:0] := PI/2
14: dest[31:0] := MAX_FLOAT
15: dest[31:0] := -MAX_FLOAT
ESAC
CASE(tsrc[31:0] of TOKEN_TYPE)
ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
ZERO_VALUE_TOKEN: if imm8[1] then set #IE
ONE_VALUE_TOKEN: if imm8[2] then set #ZE
ONE_VALUE_TOKEN: if imm8[3] then set #IE
SNAN_TOKEN: if imm8[4] then set #IE
NEG_INF_TOKEN: if imm8[5] then set #IE
NEG_VALUE_TOKEN: if imm8[6] then set #IE
POS_INF_TOKEN: if imm8[7] then set #IE
ESAC
RETURN dest[31:0]
}
IF k[0]
dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0])
ELSE
dst[31:0] := a[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vfixupimmss
__m128 _mm_maskz_fixupimm_ss (__mmask8 k, __m128 a, __m128 b, __m128i c, int imm8)
Synopsis
__m128 _mm_maskz_fixupimm_ss (__mmask8 k, __m128 a, __m128 b, __m128i c, int imm8)
#include "immintrin.h"
Instruction: vfixupimmss xmm {k}, xmm, xmm, imm
CPUID Flags: AVX512F
Description
Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. imm8 is used to set the required flags reporting.
Operation
enum TOKEN_TYPE {
QNAN_TOKEN := 0,
SNAN_TOKEN := 1,
ZERO_VALUE_TOKEN := 2,
ONE_VALUE_TOKEN := 3,
NEG_INF_TOKEN := 4,
POS_INF_TOKEN := 5,
NEG_VALUE_TOKEN := 6,
POS_VALUE_TOKEN := 7
}
FIXUPIMMPD(src1[31:0], src2[31:0], src3[31:0], imm8[7:0]){
tsrc[31:0] := ((src2[30:23] == 0) AND (MXCSR.DAZ == 1)) ? 0.0 : src2[31:0]
CASE(tsrc[31:0] of TOKEN_TYPE)
QNAN_TOKEN:j := 0
SNAN_TOKEN:j := 1
ZERO_VALUE_TOKEN: j := 2
ONE_VALUE_TOKEN: j := 3
NEG_INF_TOKEN: j := 4
POS_INF_TOKEN: j := 5
NEG_VALUE_TOKEN: j := 6
POS_VALUE_TOKEN: j := 7
ESAC
token_response[3:0] := src3[3+4*j:4*j]
CASE(token_response[3:0]) of
0 : dest[31:0] := src1[31:0]
1 : dest[31:0] := tsrc[31:0]
2 : dest[31:0] := QNaN(tsrc[31:0])
3 : dest[31:0] := QNAN_Indefinite
4 : dest[31:0] := -INF
5 : dest[31:0] := +INF
6 : dest[31:0] := tsrc.sign? -INF : +INF
7 : dest[31:0] := -0
8 : dest[31:0] := +0
9 : dest[31:0] := -1
10: dest[31:0] := +1
11: dest[31:0] := 1/2
12: dest[31:0] := 90.0
13: dest[31:0] := PI/2
14: dest[31:0] := MAX_FLOAT
15: dest[31:0] := -MAX_FLOAT
ESAC
CASE(tsrc[31:0] of TOKEN_TYPE)
ZERO_VALUE_TOKEN: if imm8[0] then set #ZE
ZERO_VALUE_TOKEN: if imm8[1] then set #IE
ONE_VALUE_TOKEN: if imm8[2] then set #ZE
ONE_VALUE_TOKEN: if imm8[3] then set #IE
SNAN_TOKEN: if imm8[4] then set #IE
NEG_INF_TOKEN: if imm8[5] then set #IE
NEG_VALUE_TOKEN: if imm8[6] then set #IE
POS_INF_TOKEN: if imm8[7] then set #IE
ESAC
RETURN dest[31:0]
}
IF k[0]
dst[31:0] := FIXUPIMMPD(a[31:0], b[31:0], c[31:0], imm8[7:0])
ELSE
dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vfixupnanpd
__m512d _mm512_fixupnan_pd (__m512d v1, __m512d v2, __m512i v3)
Synopsis
__m512d _mm512_fixupnan_pd (__m512d v1, __m512d v2, __m512i v3)
#include "immintrin.h"
Instruction: vfixupnanpd zmm {k}, zmm, zmm
CPUID Flags: KNCNI
Description
Fixes up NaN's from packed double-precision (64-bit) floating-point elements in v1 and v2, storing the results in dst and storing the quietized NaN's from v1 in v3.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := FixupNaNs(v1[i+63:i], v2[i+63:i])
v3[i+63:i] := QuietizeNaNs(v1[i+63:i])
ENDFOR
dst[MAX:512] := 0
vfixupnanpd
__m512d _mm512_mask_fixupnan_pd (__m512d v1, __mmask8 k, __m512d v2, __m512i v3)
Synopsis
__m512d _mm512_mask_fixupnan_pd (__m512d v1, __mmask8 k, __m512d v2, __m512i v3)
#include "immintrin.h"
Instruction: vfixupnanpd zmm {k}, zmm, zmm
CPUID Flags: KNCNI
Description
Fixes up NaN's from packed double-precision (64-bit) floating-point elements in v1 and v2, storing the results in dst using writemask k (only elements whose corresponding mask bit is set are used in the computation). Quietized NaN's from v1 are stored in v3.
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := FixupNaNs(v1[i+63:i], v2[i+63:i])
v3[i+63:i] := QuietizeNaNs(v1[i+63:i])
FI
ENDFOR
dst[MAX:512] := 0
vfixupnanps
__m512 _mm512_fixupnan_ps (__m512 v1, __m512 v2, __m512i v3)
Synopsis
__m512 _mm512_fixupnan_ps (__m512 v1, __m512 v2, __m512i v3)
#include "immintrin.h"
Instruction: vfixupnanps zmm {k}, zmm, zmm
CPUID Flags: KNCNI
Description
Fixes up NaN's from packed single-precision (32-bit) floating-point elements in v1 and v2, storing the results in dst and storing the quietized NaN's from v1 in v3.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := FixupNaNs(v1[i+31:i], v2[i+31:i])
v3[i+31:i] := QuietizeNaNs(v1[i+31:i])
ENDFOR
dst[MAX:512] := 0
vfixupnanps
__m512 _mm512_mask_fixupnan_ps (__m512 v1, __mmask16 k, __m512 v2, __m512i v3)
Synopsis
__m512 _mm512_mask_fixupnan_ps (__m512 v1, __mmask16 k, __m512 v2, __m512i v3)
#include "immintrin.h"
Instruction: vfixupnanps zmm {k}, zmm, zmm
CPUID Flags: KNCNI
Description
Fixes up NaN's from packed single-precision (32-bit) floating-point elements in v1 and v2, storing the results in dst using writemask k (only elements whose corresponding mask bit is set are used in the computation). Quietized NaN's from v1 are stored in v3.
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := FixupNaNs(v1[i+31:i], v2[i+31:i])
v3[i+31:i] := QuietizeNaNs(v1[i+31:i])
FI
ENDFOR
dst[MAX:512] := 0
roundpd
__m128d _mm_floor_pd (__m128d a)
Synopsis
__m128d _mm_floor_pd (__m128d a)
#include "smmintrin.h"
Instruction: roundpd xmm, xmm, imm
CPUID Flags: SSE4.1
Description
Round the packed double-precision (64-bit) floating-point elements in a down to an integer value, and store the results as packed double-precision floating-point elements in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := FLOOR(a[i+63:i])
ENDFOR
Performance
vroundpd
__m256d _mm256_floor_pd (__m256d a)
Synopsis
__m256d _mm256_floor_pd (__m256d a)
#include "immintrin.h"
Instruction: vroundpd ymm, ymm, imm
CPUID Flags: AVX
Description
Round the packed double-precision (64-bit) floating-point elements in a down to an integer value, and store the results as packed double-precision floating-point elements in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := FLOOR(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
Performance
...
__m512d _mm512_floor_pd (__m512d a)
Synopsis
__m512d _mm512_floor_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Round the packed double-precision (64-bit) floating-point elements in a down to an integer value, and store the results as packed double-precision floating-point elements in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := FLOOR(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
...
__m512d _mm512_mask_floor_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_floor_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Round the packed double-precision (64-bit) floating-point elements in a down to an integer value, and store the results as packed double-precision floating-point elements in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := FLOOR(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
roundps
__m128 _mm_floor_ps (__m128 a)
Synopsis
__m128 _mm_floor_ps (__m128 a)
#include "smmintrin.h"
Instruction: roundps xmm, xmm, imm
CPUID Flags: SSE4.1
Description
Round the packed single-precision (32-bit) floating-point elements in a down to an integer value, and store the results as packed single-precision floating-point elements in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := FLOOR(a[i+31:i])
ENDFOR
Performance
vroundps
__m256 _mm256_floor_ps (__m256 a)
Synopsis
__m256 _mm256_floor_ps (__m256 a)
#include "immintrin.h"
Instruction: vroundps ymm, ymm, imm
CPUID Flags: AVX
Description
Round the packed single-precision (32-bit) floating-point elements in a down to an integer value, and store the results as packed single-precision floating-point elements in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := FLOOR(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
Performance
...
__m512 _mm512_floor_ps (__m512 a)
Synopsis
__m512 _mm512_floor_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Round the packed single-precision (32-bit) floating-point elements in a down to an integer value, and store the results as packed single-precision floating-point elements in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := FLOOR(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
...
__m512 _mm512_mask_floor_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_floor_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Round the packed single-precision (32-bit) floating-point elements in a down to an integer value, and store the results as packed single-precision floating-point elements in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := FLOOR(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
roundsd
__m128d _mm_floor_sd (__m128d a, __m128d b)
Synopsis
__m128d _mm_floor_sd (__m128d a, __m128d b)
#include "smmintrin.h"
Instruction: roundsd xmm, xmm, imm
CPUID Flags: SSE4.1
Description
Round the lower double-precision (64-bit) floating-point element in b down to an integer value, store the result as a double-precision floating-point element in the lower element of dst, and copy the upper element from a to the upper element of dst.
Operation
dst[63:0] := FLOOR(b[63:0])
dst[127:64] := a[127:64]
Performance
roundss
__m128 _mm_floor_ss (__m128 a, __m128 b)
Synopsis
__m128 _mm_floor_ss (__m128 a, __m128 b)
#include "smmintrin.h"
Instruction: roundss xmm, xmm, imm
CPUID Flags: SSE4.1
Description
Round the lower single-precision (32-bit) floating-point element in b down to an integer value, store the result as a single-precision floating-point element in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
dst[31:0] := FLOOR(b[31:0])
dst[127:32] := a[127:32]
Performance
vpmadd231d
__m512i _mm512_fmadd_epi32 (__m512i a, __m512i b, __m512i c)
Synopsis
__m512i _mm512_fmadd_epi32 (__m512i a, __m512i b, __m512i c)
#include "immintrin.h"
Instruction: vpmadd231d zmm {k}, zmm, zmm
CPUID Flags: KNCNI
Description
Multiply packed 32-bit integer elements in a and b, add the intermediate result to packed elements in c and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
ENDFOR
dst[MAX:512] := 0
vpmadd231d
__m512i _mm512_mask_fmadd_epi32 (__m512i a, __mmask16 k, __m512i b, __m512i c)
Synopsis
__m512i _mm512_mask_fmadd_epi32 (__m512i a, __mmask16 k, __m512i b, __m512i c)
#include "immintrin.h"
Instruction: vpmadd231d zmm {k}, zmm, zmm
CPUID Flags: KNCNI
Description
Multiply packed 32-bit integer elements in a and b, add the intermediate result to packed elements in c and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
ELSE
dst[i+31:i] := a[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpmadd231d
__m512i _mm512_mask3_fmadd_epi32 (__m512i a, __m512i b, __m512i c, __mmask16 k)
Synopsis
__m512i _mm512_mask3_fmadd_epi32 (__m512i a, __m512i b, __m512i c, __mmask16 k)
#include "immintrin.h"
Instruction: vpmadd231d zmm {k}, zmm, zmm
CPUID Flags: KNCNI
Description
Multiply packed 32-bit integer elements in a and b, add the intermediate result to packed elements in c and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
ELSE
dst[i+31:i] := c[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vfmadd132pd, vfmadd213pd, vfmadd231pd
__m128d _mm_fmadd_pd (__m128d a, __m128d b, __m128d c)
Synopsis
__m128d _mm_fmadd_pd (__m128d a, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfmadd132pd xmm, xmm, xmm
vfmadd213pd xmm, xmm, xmm
vfmadd231pd xmm, xmm, xmm
CPUID Flags: FMA
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
ENDFOR
dst[MAX:128] := 0
Performance
vfmadd132pd, vfmadd213pd, vfmadd231pd
__m128d _mm_mask_fmadd_pd (__m128d a, __mmask8 k, __m128d b, __m128d c)
Synopsis
__m128d _mm_mask_fmadd_pd (__m128d a, __mmask8 k, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfmadd132pd
vfmadd213pd
vfmadd231pd
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vfmadd132pd, vfmadd213pd, vfmadd231pd
__m128d _mm_mask3_fmadd_pd (__m128d a, __m128d b, __m128d c, __mmask8 k)
Synopsis
__m128d _mm_mask3_fmadd_pd (__m128d a, __m128d b, __m128d c, __mmask8 k)
#include "immintrin.h"
Instruction: vfmadd132pd
vfmadd213pd
vfmadd231pd
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
ELSE
dst[i+63:i] := c[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vfmadd132pd, vfmadd213pd, vfmadd231pd
__m128d _mm_maskz_fmadd_pd (__mmask8 k, __m128d a, __m128d b, __m128d c)
Synopsis
__m128d _mm_maskz_fmadd_pd (__mmask8 k, __m128d a, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfmadd132pd
vfmadd213pd
vfmadd231pd
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vfmadd132pd, vfmadd213pd, vfmadd231pd
__m256d _mm256_fmadd_pd (__m256d a, __m256d b, __m256d c)
Synopsis
__m256d _mm256_fmadd_pd (__m256d a, __m256d b, __m256d c)
#include "immintrin.h"
Instruction: vfmadd132pd ymm, ymm, ymm
vfmadd213pd ymm, ymm, ymm
vfmadd231pd ymm, ymm, ymm
CPUID Flags: FMA
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
ENDFOR
dst[MAX:256] := 0
Performance
vfmadd132pd, vfmadd213pd, vfmadd231pd
__m256d _mm256_mask_fmadd_pd (__m256d a, __mmask8 k, __m256d b, __m256d c)
Synopsis
__m256d _mm256_mask_fmadd_pd (__m256d a, __mmask8 k, __m256d b, __m256d c)
#include "immintrin.h"
Instruction: vfmadd132pd
vfmadd213pd
vfmadd231pd
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vfmadd132pd, vfmadd213pd, vfmadd231pd
__m256d _mm256_mask3_fmadd_pd (__m256d a, __m256d b, __m256d c, __mmask8 k)
Synopsis
__m256d _mm256_mask3_fmadd_pd (__m256d a, __m256d b, __m256d c, __mmask8 k)
#include "immintrin.h"
Instruction: vfmadd132pd
vfmadd213pd
vfmadd231pd
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
ELSE
dst[i+63:i] := c[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vfmadd132pd, vfmadd213pd, vfmadd231pd
__m256d _mm256_maskz_fmadd_pd (__mmask8 k, __m256d a, __m256d b, __m256d c)
Synopsis
__m256d _mm256_maskz_fmadd_pd (__mmask8 k, __m256d a, __m256d b, __m256d c)
#include "immintrin.h"
Instruction: vfmadd132pd
vfmadd213pd
vfmadd231pd
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vfmadd132pd, vfmadd213pd, vfmadd231pd
__m512d _mm512_fmadd_pd (__m512d a, __m512d b, __m512d c)
Synopsis
__m512d _mm512_fmadd_pd (__m512d a, __m512d b, __m512d c)
#include "immintrin.h"
Instruction: vfmadd132pd zmm {k}, zmm, zmm
vfmadd213pd zmm {k}, zmm, zmm
vfmadd231pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
ENDFOR
dst[MAX:512] := 0
vfmadd132pd, vfmadd213pd, vfmadd231pd
__m512d _mm512_mask_fmadd_pd (__m512d a, __mmask8 k, __m512d b, __m512d c)
Synopsis
__m512d _mm512_mask_fmadd_pd (__m512d a, __mmask8 k, __m512d b, __m512d c)
#include "immintrin.h"
Instruction: vfmadd132pd zmm {k}, zmm, zmm
vfmadd213pd zmm {k}, zmm, zmm
vfmadd231pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vfmadd132pd, vfmadd213pd, vfmadd231pd
__m512d _mm512_mask3_fmadd_pd (__m512d a, __m512d b, __m512d c, __mmask8 k)
Synopsis
__m512d _mm512_mask3_fmadd_pd (__m512d a, __m512d b, __m512d c, __mmask8 k)
#include "immintrin.h"
Instruction: vfmadd132pd zmm {k}, zmm, zmm
vfmadd213pd zmm {k}, zmm, zmm
vfmadd231pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
ELSE
dst[i+63:i] := c[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vfmadd132pd, vfmadd213pd, vfmadd231pd
__m512d _mm512_maskz_fmadd_pd (__mmask8 k, __m512d a, __m512d b, __m512d c)
Synopsis
__m512d _mm512_maskz_fmadd_pd (__mmask8 k, __m512d a, __m512d b, __m512d c)
#include "immintrin.h"
Instruction: vfmadd132pd zmm {k}, zmm, zmm
vfmadd213pd zmm {k}, zmm, zmm
vfmadd231pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vfmadd132ps, vfmadd213ps, vfmadd231ps
__m128 _mm_fmadd_ps (__m128 a, __m128 b, __m128 c)
Synopsis
__m128 _mm_fmadd_ps (__m128 a, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfmadd132ps xmm, xmm, xmm
vfmadd213ps xmm, xmm, xmm
vfmadd231ps xmm, xmm, xmm
CPUID Flags: FMA
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
ENDFOR
dst[MAX:128] := 0
Performance
vfmadd132ps, vfmadd213ps, vfmadd231ps
__m128 _mm_mask_fmadd_ps (__m128 a, __mmask8 k, __m128 b, __m128 c)
Synopsis
__m128 _mm_mask_fmadd_ps (__m128 a, __mmask8 k, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfmadd132ps
vfmadd213ps
vfmadd231ps
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
ELSE
dst[i+31:i] := a[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vfmadd132ps, vfmadd213ps, vfmadd231ps
__m128 _mm_mask3_fmadd_ps (__m128 a, __m128 b, __m128 c, __mmask8 k)
Synopsis
__m128 _mm_mask3_fmadd_ps (__m128 a, __m128 b, __m128 c, __mmask8 k)
#include "immintrin.h"
Instruction: vfmadd132ps
vfmadd213ps
vfmadd231ps
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
ELSE
dst[i+31:i] := c[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vfmadd132ps, vfmadd213ps, vfmadd231ps
__m128 _mm_maskz_fmadd_ps (__mmask8 k, __m128 a, __m128 b, __m128 c)
Synopsis
__m128 _mm_maskz_fmadd_ps (__mmask8 k, __m128 a, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfmadd132ps
vfmadd213ps
vfmadd231ps
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vfmadd132ps, vfmadd213ps, vfmadd231ps
__m256 _mm256_fmadd_ps (__m256 a, __m256 b, __m256 c)
Synopsis
__m256 _mm256_fmadd_ps (__m256 a, __m256 b, __m256 c)
#include "immintrin.h"
Instruction: vfmadd132ps ymm, ymm, ymm
vfmadd213ps ymm, ymm, ymm
vfmadd231ps ymm, ymm, ymm
CPUID Flags: FMA
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
ENDFOR
dst[MAX:256] := 0
Performance
vfmadd132ps, vfmadd213ps, vfmadd231ps
__m256 _mm256_mask_fmadd_ps (__m256 a, __mmask8 k, __m256 b, __m256 c)
Synopsis
__m256 _mm256_mask_fmadd_ps (__m256 a, __mmask8 k, __m256 b, __m256 c)
#include "immintrin.h"
Instruction: vfmadd132ps
vfmadd213ps
vfmadd231ps
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
ELSE
dst[i+31:i] := a[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vfmadd132ps, vfmadd213ps, vfmadd231ps
__m256 _mm256_mask3_fmadd_ps (__m256 a, __m256 b, __m256 c, __mmask8 k)
Synopsis
__m256 _mm256_mask3_fmadd_ps (__m256 a, __m256 b, __m256 c, __mmask8 k)
#include "immintrin.h"
Instruction: vfmadd132ps
vfmadd213ps
vfmadd231ps
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
ELSE
dst[i+31:i] := c[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vfmadd132ps, vfmadd213ps, vfmadd231ps
__m256 _mm256_maskz_fmadd_ps (__mmask8 k, __m256 a, __m256 b, __m256 c)
Synopsis
__m256 _mm256_maskz_fmadd_ps (__mmask8 k, __m256 a, __m256 b, __m256 c)
#include "immintrin.h"
Instruction: vfmadd132ps
vfmadd213ps
vfmadd231ps
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vfmadd132ps, vfmadd213ps, vfmadd231ps
__m512 _mm512_fmadd_ps (__m512 a, __m512 b, __m512 c)
Synopsis
__m512 _mm512_fmadd_ps (__m512 a, __m512 b, __m512 c)
#include "immintrin.h"
Instruction: vfmadd132ps zmm {k}, zmm, zmm
vfmadd213ps zmm {k}, zmm, zmm
vfmadd231ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
ENDFOR
dst[MAX:512] := 0
vfmadd132ps, vfmadd213ps, vfmadd231ps
__m512 _mm512_mask_fmadd_ps (__m512 a, __mmask16 k, __m512 b, __m512 c)
Synopsis
__m512 _mm512_mask_fmadd_ps (__m512 a, __mmask16 k, __m512 b, __m512 c)
#include "immintrin.h"
Instruction: vfmadd132ps zmm {k}, zmm, zmm
vfmadd213ps zmm {k}, zmm, zmm
vfmadd231ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
ELSE
dst[i+31:i] := a[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vfmadd132ps, vfmadd213ps, vfmadd231ps
__m512 _mm512_mask3_fmadd_ps (__m512 a, __m512 b, __m512 c, __mmask16 k)
Synopsis
__m512 _mm512_mask3_fmadd_ps (__m512 a, __m512 b, __m512 c, __mmask16 k)
#include "immintrin.h"
Instruction: vfmadd132ps zmm {k}, zmm, zmm
vfmadd213ps zmm {k}, zmm, zmm
vfmadd231ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
ELSE
dst[i+31:i] := c[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vfmadd132ps, vfmadd213ps, vfmadd231ps
__m512 _mm512_maskz_fmadd_ps (__mmask16 k, __m512 a, __m512 b, __m512 c)
Synopsis
__m512 _mm512_maskz_fmadd_ps (__mmask16 k, __m512 a, __m512 b, __m512 c)
#include "immintrin.h"
Instruction: vfmadd132ps zmm {k}, zmm, zmm
vfmadd213ps zmm {k}, zmm, zmm
vfmadd231ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, add the intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vfmadd132pd, vfmadd213pd, vfmadd231pd
__m512d _mm512_fmadd_round_pd (__m512d a, __m512d b, __m512d c, int rounding)
Synopsis
__m512d _mm512_fmadd_round_pd (__m512d a, __m512d b, __m512d c, int rounding)
#include "immintrin.h"
Instruction: vfmadd132pd zmm {k}, zmm, zmm {er}
vfmadd213pd zmm {k}, zmm, zmm {er}
vfmadd231pd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed double-precision (64-bit) floating-point elements in
a and
b, add the intermediate result to packed elements in
c, and store the results in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
ENDFOR
dst[MAX:512] := 0
vfmadd132pd, vfmadd213pd, vfmadd231pd
__m512d _mm512_mask_fmadd_round_pd (__m512d a, __mmask8 k, __m512d b, __m512d c, int rounding)
Synopsis
__m512d _mm512_mask_fmadd_round_pd (__m512d a, __mmask8 k, __m512d b, __m512d c, int rounding)
#include "immintrin.h"
Instruction: vfmadd132pd zmm {k}, zmm, zmm {er}
vfmadd213pd zmm {k}, zmm, zmm {er}
vfmadd231pd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed double-precision (64-bit) floating-point elements in
a and
b, add the intermediate result to packed elements in
c, and store the results in
dst using writemask
k (elements are copied from
a when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vfmadd132pd, vfmadd213pd, vfmadd231pd
__m512d _mm512_mask3_fmadd_round_pd (__m512d a, __m512d b, __m512d c, __mmask8 k, int rounding)
Synopsis
__m512d _mm512_mask3_fmadd_round_pd (__m512d a, __m512d b, __m512d c, __mmask8 k, int rounding)
#include "immintrin.h"
Instruction: vfmadd132pd zmm {k}, zmm, zmm {er}
vfmadd213pd zmm {k}, zmm, zmm {er}
vfmadd231pd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed double-precision (64-bit) floating-point elements in
a and
b, add the intermediate result to packed elements in
c, and store the results in
dst using writemask
k (elements are copied from
c when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
ELSE
dst[i+63:i] := c[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vfmadd132pd, vfmadd213pd, vfmadd231pd
__m512d _mm512_maskz_fmadd_round_pd (__mmask8 k, __m512d a, __m512d b, __m512d c, const int rounding)
Synopsis
__m512d _mm512_maskz_fmadd_round_pd (__mmask8 k, __m512d a, __m512d b, __m512d c, const int rounding)
#include "immintrin.h"
Instruction: vfmadd132pd zmm {k}, zmm, zmm {er}
vfmadd213pd zmm {k}, zmm, zmm {er}
vfmadd231pd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in
a and
b, add the intermediate result to packed elements in
c, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vfmadd132ps, vfmadd213ps, vfmadd231ps
__m512 _mm512_fmadd_round_ps (__m512 a, __m512 b, __m512 c, int rounding)
Synopsis
__m512 _mm512_fmadd_round_ps (__m512 a, __m512 b, __m512 c, int rounding)
#include "immintrin.h"
Instruction: vfmadd132ps zmm {k}, zmm, zmm {er}
vfmadd213ps zmm {k}, zmm, zmm {er}
vfmadd231ps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed single-precision (32-bit) floating-point elements in
a and
b, add the intermediate result to packed elements in
c, and store the results in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
ENDFOR
dst[MAX:512] := 0
vfmadd132ps, vfmadd213ps, vfmadd231ps
__m512 _mm512_mask_fmadd_round_ps (__m512 a, __mmask16 k, __m512 b, __m512 c, int rounding)
Synopsis
__m512 _mm512_mask_fmadd_round_ps (__m512 a, __mmask16 k, __m512 b, __m512 c, int rounding)
#include "immintrin.h"
Instruction: vfmadd132ps zmm {k}, zmm, zmm {er}
vfmadd213ps zmm {k}, zmm, zmm {er}
vfmadd231ps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed single-precision (32-bit) floating-point elements in
a and
b, add the intermediate result to packed elements in
c, and store the results in
dst using writemask
k (elements are copied from
a when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
ELSE
dst[i+31:i] := a[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vfmadd132ps, vfmadd213ps, vfmadd231ps
__m512 _mm512_mask3_fmadd_round_ps (__m512 a, __m512 b, __m512 c, __mmask16 k, int rounding)
Synopsis
__m512 _mm512_mask3_fmadd_round_ps (__m512 a, __m512 b, __m512 c, __mmask16 k, int rounding)
#include "immintrin.h"
Instruction: vfmadd132ps zmm {k}, zmm, zmm {er}
vfmadd213ps zmm {k}, zmm, zmm {er}
vfmadd231ps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed single-precision (32-bit) floating-point elements in
a and
b, add the intermediate result to packed elements in
c, and store the results in
dst using writemask
k (elements are copied from
c when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
ELSE
dst[i+31:i] := c[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vfmadd132ps, vfmadd213ps, vfmadd231ps
__m512 _mm512_maskz_fmadd_round_ps (__mmask16 k, __m512 a, __m512 b, __m512 c, const int rounding)
Synopsis
__m512 _mm512_maskz_fmadd_round_ps (__mmask16 k, __m512 a, __m512 b, __m512 c, const int rounding)
#include "immintrin.h"
Instruction: vfmadd132ps zmm {k}, zmm, zmm {er}
vfmadd213ps zmm {k}, zmm, zmm {er}
vfmadd231ps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in
a and
b, add the intermediate result to packed elements in
c, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vfmadd132sd, vfmadd213sd, vfmadd231sd
__m128d _mm_mask_fmadd_round_sd (__m128d a, __mmask8 k, __m128d b, __m128d c, int rounding)
Synopsis
__m128d _mm_mask_fmadd_round_sd (__m128d a, __mmask8 k, __m128d b, __m128d c, int rounding)
#include "immintrin.h"
Instruction: vfmadd132sd xmm {k}, xmm, xmm {er}
vfmadd213sd xmm {k}, xmm, xmm {er}
vfmadd231sd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Multiply the lower double-precision (64-bit) floating-point elements in
a and
b, and add the intermediate result to the lower element in
c. Store the result in the lower element of
dst using writemask
k (the element is copied from
a when mask bit 0 is not set), and copy the upper element from
a to the upper element of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[63:0] := (a[63:0] * b[63:0]) + c[63:0]
ELSE
dst[63:0] := a[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vfmadd132sd, vfmadd213sd, vfmadd231sd
__m128d _mm_mask3_fmadd_round_sd (__m128d a, __m128d b, __m128d c, __mmask8 k, int rounding)
Synopsis
__m128d _mm_mask3_fmadd_round_sd (__m128d a, __m128d b, __m128d c, __mmask8 k, int rounding)
#include "immintrin.h"
Instruction: vfmadd132sd xmm {k}, xmm, xmm {er}
vfmadd213sd xmm {k}, xmm, xmm {er}
vfmadd231sd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Multiply the lower double-precision (64-bit) floating-point elements in
a and
b, and add the intermediate result to the lower element in
c. Store the result in the lower element of
dst using writemask
k (the element is copied from
c when mask bit 0 is not set), and copy the upper element from
a to the upper element of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[63:0] := (a[63:0] * b[63:0]) + c[63:0]
ELSE
dst[63:0] := c[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vfmadd132sd, vfmadd213sd, vfmadd231sd
__m128d _mm_maskz_fmadd_round_sd (__mmask8 k, __m128d a, __m128d b, __m128d c, int rounding)
Synopsis
__m128d _mm_maskz_fmadd_round_sd (__mmask8 k, __m128d a, __m128d b, __m128d c, int rounding)
#include "immintrin.h"
Instruction: vfmadd132sd xmm {k}, xmm, xmm {er}
vfmadd213sd xmm {k}, xmm, xmm {er}
vfmadd231sd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Multiply the lower double-precision (64-bit) floating-point elements in
a and
b, and add the intermediate result to the lower element in
c. Store the result in the lower element of
dst using zeromask
k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from
a to the upper element of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[63:0] := (a[63:0] * b[63:0]) + c[63:0]
ELSE
dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vfmadd132ss, vfmadd213ss, vfmadd231ss
__m128 _mm_mask_fmadd_round_ss (__m128 a, __mmask8 k, __m128 b, __m128 c, int rounding)
Synopsis
__m128 _mm_mask_fmadd_round_ss (__m128 a, __mmask8 k, __m128 b, __m128 c, int rounding)
#include "immintrin.h"
Instruction: vfmadd132ss xmm {k}, xmm, xmm {er}
vfmadd213ss xmm {k}, xmm, xmm {er}
vfmadd231ss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Multiply the lower single-precision (32-bit) floating-point elements in
a and
b, and add the intermediate result to the lower element in
c. Store the result in the lower element of
dst using writemask
k (the element is copied from
a when mask bit 0 is not set), and copy the upper 3 packed elements from
a to the upper elements of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[31:0] := (a[31:0] * b[31:0]) + c[31:0]
ELSE
dst[31:0] := a[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vfmadd132ss, vfmadd213ss, vfmadd231ss
__m128 _mm_mask3_fmadd_round_ss (__m128 a, __m128 b, __m128 c, __mmask8 k, int rounding)
Synopsis
__m128 _mm_mask3_fmadd_round_ss (__m128 a, __m128 b, __m128 c, __mmask8 k, int rounding)
#include "immintrin.h"
Instruction: vfmadd132ss xmm {k}, xmm, xmm {er}
vfmadd213ss xmm {k}, xmm, xmm {er}
vfmadd231ss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Multiply the lower single-precision (32-bit) floating-point elements in
a and
b, and add the intermediate result to the lower element in
c. Store the result in the lower element of
dst using writemask
k (the element is copied from
c when mask bit 0 is not set), and copy the upper 3 packed elements from
a to the upper elements of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[31:0] := (a[31:0] * b[31:0]) + c[31:0]
ELSE
dst[31:0] := c[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vfmadd132ss, vfmadd213ss, vfmadd231ss
__m128 _mm_maskz_fmadd_round_ss (__mmask8 k, __m128 a, __m128 b, __m128 c, int rounding)
Synopsis
__m128 _mm_maskz_fmadd_round_ss (__mmask8 k, __m128 a, __m128 b, __m128 c, int rounding)
#include "immintrin.h"
Instruction: vfmadd132ss xmm {k}, xmm, xmm {er}
vfmadd213ss xmm {k}, xmm, xmm {er}
vfmadd231ss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Multiply the lower single-precision (32-bit) floating-point elements in
a and
b, and add the intermediate result to the lower element in
c. Store the result in the lower element of
dst using zeromask
k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from
a to the upper elements of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[31:0] := (a[31:0] * b[31:0]) + c[31:0]
ELSE
dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vfmadd132sd, vfmadd213sd, vfmadd231sd
__m128d _mm_fmadd_sd (__m128d a, __m128d b, __m128d c)
Synopsis
__m128d _mm_fmadd_sd (__m128d a, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfmadd132sd xmm, xmm, xmm
vfmadd213sd xmm, xmm, xmm
vfmadd231sd xmm, xmm, xmm
CPUID Flags: FMA
Description
Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
Operation
dst[63:0] := (a[63:0] * b[63:0]) + c[63:0]
dst[127:64] := a[127:64]
dst[MAX:128] := 0
Performance
vfmadd132sd, vfmadd213sd, vfmadd231sd
__m128d _mm_mask_fmadd_sd (__m128d a, __mmask8 k, __m128d b, __m128d c)
Synopsis
__m128d _mm_mask_fmadd_sd (__m128d a, __mmask8 k, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfmadd132sd xmm {k}, xmm, xmm
vfmadd213sd xmm {k}, xmm, xmm
vfmadd231sd xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Operation
IF k[0]
dst[63:0] := (a[63:0] * b[63:0]) + c[63:0]
ELSE
dst[63:0] := a[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vfmadd132sd, vfmadd213sd, vfmadd231sd
__m128d _mm_mask3_fmadd_sd (__m128d a, __m128d b, __m128d c, __mmask8 k)
Synopsis
__m128d _mm_mask3_fmadd_sd (__m128d a, __m128d b, __m128d c, __mmask8 k)
#include "immintrin.h"
Instruction: vfmadd132sd xmm {k}, xmm, xmm
vfmadd213sd xmm {k}, xmm, xmm
vfmadd231sd xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Operation
IF k[0]
dst[63:0] := (a[63:0] * b[63:0]) + c[63:0]
ELSE
dst[63:0] := c[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vfmadd132sd, vfmadd213sd, vfmadd231sd
__m128d _mm_maskz_fmadd_sd (__mmask8 k, __m128d a, __m128d b, __m128d c)
Synopsis
__m128d _mm_maskz_fmadd_sd (__mmask8 k, __m128d a, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfmadd132sd xmm {k}, xmm, xmm
vfmadd213sd xmm {k}, xmm, xmm
vfmadd231sd xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Operation
IF k[0]
dst[63:0] := (a[63:0] * b[63:0]) + c[63:0]
ELSE
dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vfmadd132ss, vfmadd213ss, vfmadd231ss
__m128 _mm_fmadd_ss (__m128 a, __m128 b, __m128 c)
Synopsis
__m128 _mm_fmadd_ss (__m128 a, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfmadd132ss xmm, xmm, xmm
vfmadd213ss xmm, xmm, xmm
vfmadd231ss xmm, xmm, xmm
CPUID Flags: FMA
Description
Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
dst[31:0] := (a[31:0] * b[31:0]) + c[31:0]
dst[127:32] := a[127:32]
dst[MAX:128] := 0
Performance
vfmadd132ss, vfmadd213ss, vfmadd231ss
__m128 _mm_mask_fmadd_ss (__m128 a, __mmask8 k, __m128 b, __m128 c)
Synopsis
__m128 _mm_mask_fmadd_ss (__m128 a, __mmask8 k, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfmadd132ss xmm {k}, xmm, xmm
vfmadd213ss xmm {k}, xmm, xmm
vfmadd231ss xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
IF k[0]
dst[31:0] := (a[31:0] * b[31:0]) + c[31:0]
ELSE
dst[31:0] := a[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vfmadd132ss, vfmadd213ss, vfmadd231ss
__m128 _mm_mask3_fmadd_ss (__m128 a, __m128 b, __m128 c, __mmask8 k)
Synopsis
__m128 _mm_mask3_fmadd_ss (__m128 a, __m128 b, __m128 c, __mmask8 k)
#include "immintrin.h"
Instruction: vfmadd132ss xmm {k}, xmm, xmm
vfmadd213ss xmm {k}, xmm, xmm
vfmadd231ss xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
IF k[0]
dst[31:0] := (a[31:0] * b[31:0]) + c[31:0]
ELSE
dst[31:0] := c[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vfmadd132ss, vfmadd213ss, vfmadd231ss
__m128 _mm_maskz_fmadd_ss (__mmask8 k, __m128 a, __m128 b, __m128 c)
Synopsis
__m128 _mm_maskz_fmadd_ss (__mmask8 k, __m128 a, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfmadd132ss xmm {k}, xmm, xmm
vfmadd213ss xmm {k}, xmm, xmm
vfmadd231ss xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
IF k[0]
dst[31:0] := (a[31:0] * b[31:0]) + c[31:0]
ELSE
dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vpmadd233d
__m512i _mm512_fmadd233_epi32 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_fmadd233_epi32 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmadd233d zmm {k}, zmm, zmm
CPUID Flags: KNCNI
Description
Multiply packed 32-bit integer elements in each 4-element set of a and by element 1 of the corresponding 4-element set from b, add the intermediate result to element 0 of the corresponding 4-element set from b, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
base := (j & ~0x3) * 32
scale[31:0] := b[base+63:base+32]
bias[31:0] := b[base+31:base]
dst[i+31:i] := (a[i+31:i] * scale[31:0]) + bias[31:0]
ENDFOR
dst[MAX:512] := 0
vpmadd233d
__m512i _mm512_mask_fmadd233_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_fmadd233_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmadd233d zmm {k}, zmm, zmm
CPUID Flags: KNCNI
Description
Multiply packed 32-bit integer elements in each 4-element set of a and by element 1 of the corresponding 4-element set from b, add the intermediate result to element 0 of the corresponding 4-element set from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
base := (j & ~0x3) * 32
scale[31:0] := b[base+63:base+32]
bias[31:0] := b[base+31:base]
dst[i+31:i] := (a[i+31:i] * scale[31:0]) + bias[31:0]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vfmadd233ps
__m512 _mm512_fmadd233_ps (__m512 a, __m512 b)
Synopsis
__m512 _mm512_fmadd233_ps (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vfmadd233ps zmm {k}, zmm, m512
CPUID Flags: KNCNI
Description
Multiply packed single-precision (32-bit) floating-point elements in each 4-element set of a and by element 1 of the corresponding 4-element set from b, add the intermediate result to element 0 of the corresponding 4-element set from b, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
base := (j & ~0x3) * 32
scale[31:0] := b[base+63:base+32]
bias[31:0] := b[base+31:base]
dst[i+31:i] := (a[i+31:i] * scale[31:0]) + bias[31:0]
ENDFOR
dst[MAX:512] := 0
vfmadd233ps
__m512 _mm512_mask_fmadd233_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
Synopsis
__m512 _mm512_mask_fmadd233_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vfmadd233ps zmm {k}, zmm, m512
CPUID Flags: KNCNI
Description
Multiply packed single-precision (32-bit) floating-point elements in each 4-element set of a and by element 1 of the corresponding 4-element set from b, add the intermediate result to element 0 of the corresponding 4-element set from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
base := (j & ~0x3) * 32
scale[31:0] := b[base+63:base+32]
bias[31:0] := b[base+31:base]
dst[i+31:i] := (a[i+31:i] * scale[31:0]) + bias[31:0]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vfmadd233ps
__m512 _mm512_fmadd233_round_ps (__m512 a, __m512 b, int rounding)
Synopsis
__m512 _mm512_fmadd233_round_ps (__m512 a, __m512 b, int rounding)
#include "immintrin.h"
Instruction: vfmadd233ps zmm {k}, zmm, zmm
CPUID Flags: KNCNI
Description
Multiply packed single-precision (32-bit) floating-point elements in each 4-element set of
a and by element 1 of the corresponding 4-element set from
b, add the intermediate result to element 0 of the corresponding 4-element set from
b, and store the results in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
base := (j & ~0x3) * 32
scale[31:0] := b[base+63:base+32]
bias[31:0] := b[base+31:base]
dst[i+31:i] := (a[i+31:i] * scale[31:0]) + bias[31:0]
ENDFOR
dst[MAX:512] := 0
vfmadd233ps
__m512 _mm512_mask_fmadd233_round_ps (__m512 src, __mmask16 k, __m512 a, __m512 b, int rounding)
Synopsis
__m512 _mm512_mask_fmadd233_round_ps (__m512 src, __mmask16 k, __m512 a, __m512 b, int rounding)
#include "immintrin.h"
Instruction: vfmadd233ps zmm {k}, zmm, zmm
CPUID Flags: KNCNI
Description
Multiply packed single-precision (32-bit) floating-point elements in each 4-element set of
a and by element 1 of the corresponding 4-element set from
b, add the intermediate result to element 0 of the corresponding 4-element set from
b, and store the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
base := (j & ~0x3) * 32
scale[31:0] := b[base+63:base+32]
bias[31:0] := b[base+31:base]
dst[i+31:i] := (a[i+31:i] * scale[31:0]) + bias[31:0]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vfmaddsub132pd, vfmaddsub213pd, vfmaddsub231pd
__m128d _mm_fmaddsub_pd (__m128d a, __m128d b, __m128d c)
Synopsis
__m128d _mm_fmaddsub_pd (__m128d a, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfmaddsub132pd xmm, xmm, xmm
vfmaddsub213pd xmm, xmm, xmm
vfmaddsub231pd xmm, xmm, xmm
CPUID Flags: FMA
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
IF (j is even)
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
ELSE
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
Performance
vfmaddsub132pd, vfmaddsub213pd, vfmaddsub231pd
__m128d _mm_mask_fmaddsub_pd (__m128d a, __mmask8 k, __m128d b, __m128d c)
Synopsis
__m128d _mm_mask_fmaddsub_pd (__m128d a, __mmask8 k, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfmaddsub132pd
vfmaddsub213pd
vfmaddsub231pd
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
IF (j is even)
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
ELSE
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
FI
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vfmaddsub132pd, vfmaddsub213pd, vfmaddsub231pd
__m128d _mm_mask3_fmaddsub_pd (__m128d a, __m128d b, __m128d c, __mmask8 k)
Synopsis
__m128d _mm_mask3_fmaddsub_pd (__m128d a, __m128d b, __m128d c, __mmask8 k)
#include "immintrin.h"
Instruction: vfmaddsub132pd
vfmaddsub213pd
vfmaddsub231pd
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
IF (j is even)
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
ELSE
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
FI
ELSE
dst[i+63:i] := c[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vfmaddsub132pd, vfmaddsub213pd, vfmaddsub231pd
__m128d _mm_maskz_fmaddsub_pd (__mmask8 k, __m128d a, __m128d b, __m128d c)
Synopsis
__m128d _mm_maskz_fmaddsub_pd (__mmask8 k, __m128d a, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfmaddsub132pd
vfmaddsub213pd
vfmaddsub231pd
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
IF (j is even)
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
ELSE
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
FI
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vfmaddsub132pd, vfmaddsub213pd, vfmaddsub231pd
__m256d _mm256_fmaddsub_pd (__m256d a, __m256d b, __m256d c)
Synopsis
__m256d _mm256_fmaddsub_pd (__m256d a, __m256d b, __m256d c)
#include "immintrin.h"
Instruction: vfmaddsub132pd ymm, ymm, ymm
vfmaddsub213pd ymm, ymm, ymm
vfmaddsub231pd ymm, ymm, ymm
CPUID Flags: FMA
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
IF (j is even)
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
ELSE
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
Performance
vfmaddsub132pd, vfmaddsub213pd, vfmaddsub231pd
__m256d _mm256_mask_fmaddsub_pd (__m256d a, __mmask8 k, __m256d b, __m256d c)
Synopsis
__m256d _mm256_mask_fmaddsub_pd (__m256d a, __mmask8 k, __m256d b, __m256d c)
#include "immintrin.h"
Instruction: vfmaddsub132pd
vfmaddsub213pd
vfmaddsub231pd
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
IF (j is even)
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
ELSE
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
FI
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vfmaddsub132pd, vfmaddsub213pd, vfmaddsub231pd
__m256d _mm256_mask3_fmaddsub_pd (__m256d a, __m256d b, __m256d c, __mmask8 k)
Synopsis
__m256d _mm256_mask3_fmaddsub_pd (__m256d a, __m256d b, __m256d c, __mmask8 k)
#include "immintrin.h"
Instruction: vfmaddsub132pd
vfmaddsub213pd
vfmaddsub231pd
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
IF (j is even)
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
ELSE
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
FI
ELSE
dst[i+63:i] := c[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vfmaddsub132pd, vfmaddsub213pd, vfmaddsub231pd
__m256d _mm256_maskz_fmaddsub_pd (__mmask8 k, __m256d a, __m256d b, __m256d c)
Synopsis
__m256d _mm256_maskz_fmaddsub_pd (__mmask8 k, __m256d a, __m256d b, __m256d c)
#include "immintrin.h"
Instruction: vfmaddsub132pd
vfmaddsub213pd
vfmaddsub231pd
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
IF (j is even)
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
ELSE
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
FI
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vfmaddsub132pd, vfmaddsub213pd, vfmaddsub231pd
__m512d _mm512_fmaddsub_pd (__m512d a, __m512d b, __m512d c)
Synopsis
__m512d _mm512_fmaddsub_pd (__m512d a, __m512d b, __m512d c)
#include "immintrin.h"
Instruction: vfmaddsub132pd zmm {k}, zmm, zmm
vfmaddsub213pd zmm {k}, zmm, zmm
vfmaddsub231pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
IF (j is even)
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
ELSE
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vfmaddsub132pd, vfmaddsub213pd, vfmaddsub231pd
__m512d _mm512_mask_fmaddsub_pd (__m512d a, __mmask8 k, __m512d b, __m512d c)
Synopsis
__m512d _mm512_mask_fmaddsub_pd (__m512d a, __mmask8 k, __m512d b, __m512d c)
#include "immintrin.h"
Instruction: vfmaddsub132pd zmm {k}, zmm, zmm
vfmaddsub213pd zmm {k}, zmm, zmm
vfmaddsub231pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
IF (j is even)
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
ELSE
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
FI
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vfmaddsub132pd, vfmaddsub213pd, vfmaddsub231pd
__m512d _mm512_mask3_fmaddsub_pd (__m512d a, __m512d b, __m512d c, __mmask8 k)
Synopsis
__m512d _mm512_mask3_fmaddsub_pd (__m512d a, __m512d b, __m512d c, __mmask8 k)
#include "immintrin.h"
Instruction: vfmaddsub132pd zmm {k}, zmm, zmm
vfmaddsub213pd zmm {k}, zmm, zmm
vfmaddsub231pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
IF (j is even)
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
ELSE
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
FI
ELSE
dst[i+63:i] := c[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vfmaddsub132pd, vfmaddsub213pd, vfmaddsub231pd
__m512d _mm512_maskz_fmaddsub_pd (__mmask8 k, __m512d a, __m512d b, __m512d c)
Synopsis
__m512d _mm512_maskz_fmaddsub_pd (__mmask8 k, __m512d a, __m512d b, __m512d c)
#include "immintrin.h"
Instruction: vfmaddsub132pd zmm {k}, zmm, zmm
vfmaddsub213pd zmm {k}, zmm, zmm
vfmaddsub231pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
IF (j is even)
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
ELSE
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
FI
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vfmaddsub132ps, vfmaddsub213ps, vfmaddsub231ps
__m128 _mm_fmaddsub_ps (__m128 a, __m128 b, __m128 c)
Synopsis
__m128 _mm_fmaddsub_ps (__m128 a, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfmaddsub132ps xmm, xmm, xmm
vfmaddsub213ps xmm, xmm, xmm
vfmaddsub231ps xmm, xmm, xmm
CPUID Flags: FMA
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
IF (j is even)
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
ELSE
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
Performance
vfmaddsub132ps, vfmaddsub213ps, vfmaddsub231ps
__m128 _mm_mask_fmaddsub_ps (__m128 a, __mmask8 k, __m128 b, __m128 c)
Synopsis
__m128 _mm_mask_fmaddsub_ps (__m128 a, __mmask8 k, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfmaddsub132ps
vfmaddsub213ps
vfmaddsub231ps
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
IF (j is even)
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
ELSE
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
FI
ELSE
dst[i+31:i] := a[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vfmaddsub132ps, vfmaddsub213ps, vfmaddsub231ps
__m128 _mm_mask3_fmaddsub_ps (__m128 a, __m128 b, __m128 c, __mmask8 k)
Synopsis
__m128 _mm_mask3_fmaddsub_ps (__m128 a, __m128 b, __m128 c, __mmask8 k)
#include "immintrin.h"
Instruction: vfmaddsub132ps
vfmaddsub213ps
vfmaddsub231ps
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
IF (j is even)
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
ELSE
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
FI
ELSE
dst[i+31:i] := c[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vfmaddsub132ps, vfmaddsub213ps, vfmaddsub231ps
__m128 _mm_maskz_fmaddsub_ps (__mmask8 k, __m128 a, __m128 b, __m128 c)
Synopsis
__m128 _mm_maskz_fmaddsub_ps (__mmask8 k, __m128 a, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfmaddsub132ps
vfmaddsub213ps
vfmaddsub231ps
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
IF (j is even)
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
ELSE
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
FI
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vfmaddsub132ps, vfmaddsub213ps, vfmaddsub231ps
__m256 _mm256_fmaddsub_ps (__m256 a, __m256 b, __m256 c)
Synopsis
__m256 _mm256_fmaddsub_ps (__m256 a, __m256 b, __m256 c)
#include "immintrin.h"
Instruction: vfmaddsub132ps ymm, ymm, ymm
vfmaddsub213ps ymm, ymm, ymm
vfmaddsub231ps ymm, ymm, ymm
CPUID Flags: FMA
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
IF (j is even)
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
ELSE
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
Performance
vfmaddsub132ps, vfmaddsub213ps, vfmaddsub231ps
__m256 _mm256_mask_fmaddsub_ps (__m256 a, __mmask8 k, __m256 b, __m256 c)
Synopsis
__m256 _mm256_mask_fmaddsub_ps (__m256 a, __mmask8 k, __m256 b, __m256 c)
#include "immintrin.h"
Instruction: vfmaddsub132ps
vfmaddsub213ps
vfmaddsub231ps
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
IF (j is even)
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
ELSE
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
FI
ELSE
dst[i+31:i] := a[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vfmaddsub132ps, vfmaddsub213ps, vfmaddsub231ps
__m256 _mm256_mask3_fmaddsub_ps (__m256 a, __m256 b, __m256 c, __mmask8 k)
Synopsis
__m256 _mm256_mask3_fmaddsub_ps (__m256 a, __m256 b, __m256 c, __mmask8 k)
#include "immintrin.h"
Instruction: vfmaddsub132ps
vfmaddsub213ps
vfmaddsub231ps
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
IF (j is even)
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
ELSE
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
FI
ELSE
dst[i+31:i] := c[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vfmaddsub132ps, vfmaddsub213ps, vfmaddsub231ps
__m256 _mm256_maskz_fmaddsub_ps (__mmask8 k, __m256 a, __m256 b, __m256 c)
Synopsis
__m256 _mm256_maskz_fmaddsub_ps (__mmask8 k, __m256 a, __m256 b, __m256 c)
#include "immintrin.h"
Instruction: vfmaddsub132ps
vfmaddsub213ps
vfmaddsub231ps
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
IF (j is even)
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
ELSE
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
FI
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vfmaddsub132ps, vfmaddsub213ps, vfmaddsub231ps
__m512 _mm512_fmaddsub_ps (__m512 a, __m512 b, __m512 c)
Synopsis
__m512 _mm512_fmaddsub_ps (__m512 a, __m512 b, __m512 c)
#include "immintrin.h"
Instruction: vfmaddsub132ps zmm {k}, zmm, zmm
vfmaddsub213ps zmm {k}, zmm, zmm
vfmaddsub231ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
IF (j is even)
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
ELSE
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vfmaddsub132ps, vfmaddsub213ps, vfmaddsub231ps
__m512 _mm512_mask_fmaddsub_ps (__m512 a, __mmask16 k, __m512 b, __m512 c)
Synopsis
__m512 _mm512_mask_fmaddsub_ps (__m512 a, __mmask16 k, __m512 b, __m512 c)
#include "immintrin.h"
Instruction: vfmaddsub132ps zmm {k}, zmm, zmm
vfmaddsub213ps zmm {k}, zmm, zmm
vfmaddsub231ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
IF (j is even)
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
ELSE
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
FI
ELSE
dst[i+31:i] := a[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vfmaddsub132ps, vfmaddsub213ps, vfmaddsub231ps
__m512 _mm512_mask3_fmaddsub_ps (__m512 a, __m512 b, __m512 c, __mmask16 k)
Synopsis
__m512 _mm512_mask3_fmaddsub_ps (__m512 a, __m512 b, __m512 c, __mmask16 k)
#include "immintrin.h"
Instruction: vfmaddsub132ps zmm {k}, zmm, zmm
vfmaddsub213ps zmm {k}, zmm, zmm
vfmaddsub231ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
IF (j is even)
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
ELSE
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
FI
ELSE
dst[i+31:i] := c[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vfmaddsub132ps, vfmaddsub213ps, vfmaddsub231ps
__m512 _mm512_maskz_fmaddsub_ps (__mmask16 k, __m512 a, __m512 b, __m512 c)
Synopsis
__m512 _mm512_maskz_fmaddsub_ps (__mmask16 k, __m512 a, __m512 b, __m512 c)
#include "immintrin.h"
Instruction: vfmaddsub132ps zmm {k}, zmm, zmm
vfmaddsub213ps zmm {k}, zmm, zmm
vfmaddsub231ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
IF (j is even)
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
ELSE
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
FI
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vfmaddsub132pd, vfmaddsub213pd, vfmaddsub231pd
__m512d _mm512_fmaddsub_round_pd (__m512d a, __m512d b, __m512d c, const int rounding)
Synopsis
__m512d _mm512_fmaddsub_round_pd (__m512d a, __m512d b, __m512d c, const int rounding)
#include "immintrin.h"
Instruction: vfmaddsub132pd zmm {k}, zmm, zmm {er}
vfmaddsub213pd zmm {k}, zmm, zmm {er}
vfmaddsub231pd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
IF (j is even)
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
ELSE
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vfmaddsub132pd, vfmaddsub213pd, vfmaddsub231pd
__m512d _mm512_mask_fmaddsub_round_pd (__m512d a, __mmask8 k, __m512d b, __m512d c, const int rounding)
Synopsis
__m512d _mm512_mask_fmaddsub_round_pd (__m512d a, __mmask8 k, __m512d b, __m512d c, const int rounding)
#include "immintrin.h"
Instruction: vfmaddsub132pd zmm {k}, zmm, zmm {er}
vfmaddsub213pd zmm {k}, zmm, zmm {er}
vfmaddsub231pd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
IF (j is even)
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
ELSE
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
FI
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vfmaddsub132pd, vfmaddsub213pd, vfmaddsub231pd
__m512d _mm512_mask3_fmaddsub_round_pd (__m512d a, __m512d b, __m512d c, __mmask8 k, const int rounding)
Synopsis
__m512d _mm512_mask3_fmaddsub_round_pd (__m512d a, __m512d b, __m512d c, __mmask8 k, const int rounding)
#include "immintrin.h"
Instruction: vfmaddsub132pd zmm {k}, zmm, zmm {er}
vfmaddsub213pd zmm {k}, zmm, zmm {er}
vfmaddsub231pd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
IF (j is even)
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
ELSE
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
FI
ELSE
dst[i+63:i] := c[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vfmaddsub132pd, vfmaddsub213pd, vfmaddsub231pd
__m512d _mm512_maskz_fmaddsub_round_pd (__mmask8 k, __m512d a, __m512d b, __m512d c, const int rounding)
Synopsis
__m512d _mm512_maskz_fmaddsub_round_pd (__mmask8 k, __m512d a, __m512d b, __m512d c, const int rounding)
#include "immintrin.h"
Instruction: vfmaddsub132pd zmm {k}, zmm, zmm {er}
vfmaddsub213pd zmm {k}, zmm, zmm {er}
vfmaddsub231pd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
IF (j is even)
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
ELSE
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
FI
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vfmaddsub132ps, vfmaddsub213ps, vfmaddsub231ps
__m512 _mm512_fmaddsub_round_ps (__m512 a, __m512 b, __m512 c, const int rounding)
Synopsis
__m512 _mm512_fmaddsub_round_ps (__m512 a, __m512 b, __m512 c, const int rounding)
#include "immintrin.h"
Instruction: vfmaddsub132ps zmm {k}, zmm, zmm {er}
vfmaddsub213ps zmm {k}, zmm, zmm {er}
vfmaddsub231ps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
IF (j is even)
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
ELSE
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vfmaddsub132ps, vfmaddsub213ps, vfmaddsub231ps
__m512 _mm512_mask_fmaddsub_round_ps (__m512 a, __mmask16 k, __m512 b, __m512 c, const int rounding)
Synopsis
__m512 _mm512_mask_fmaddsub_round_ps (__m512 a, __mmask16 k, __m512 b, __m512 c, const int rounding)
#include "immintrin.h"
Instruction: vfmaddsub132ps zmm {k}, zmm, zmm {er}
vfmaddsub213ps zmm {k}, zmm, zmm {er}
vfmaddsub231ps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
IF (j is even)
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
ELSE
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
FI
ELSE
dst[i+31:i] := a[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vfmaddsub132ps, vfmaddsub213ps, vfmaddsub231ps
__m512 _mm512_mask3_fmaddsub_round_ps (__m512 a, __m512 b, __m512 c, __mmask16 k, const int rounding)
Synopsis
__m512 _mm512_mask3_fmaddsub_round_ps (__m512 a, __m512 b, __m512 c, __mmask16 k, const int rounding)
#include "immintrin.h"
Instruction: vfmaddsub132ps zmm {k}, zmm, zmm {er}
vfmaddsub213ps zmm {k}, zmm, zmm {er}
vfmaddsub231ps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
IF (j is even)
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
ELSE
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
FI
ELSE
dst[i+31:i] := c[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vfmaddsub132ps, vfmaddsub213ps, vfmaddsub231ps
__m512 _mm512_maskz_fmaddsub_round_ps (__mmask16 k, __m512 a, __m512 b, __m512 c, const int rounding)
Synopsis
__m512 _mm512_maskz_fmaddsub_round_ps (__mmask16 k, __m512 a, __m512 b, __m512 c, const int rounding)
#include "immintrin.h"
Instruction: vfmaddsub132ps zmm {k}, zmm, zmm {er}
vfmaddsub213ps zmm {k}, zmm, zmm {er}
vfmaddsub231ps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
IF (j is even)
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
ELSE
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
FI
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vfmsub132pd, vfmsub213pd, vfmsub231pd
__m128d _mm_fmsub_pd (__m128d a, __m128d b, __m128d c)
Synopsis
__m128d _mm_fmsub_pd (__m128d a, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfmsub132pd xmm, xmm, xmm
vfmsub213pd xmm, xmm, xmm
vfmsub231pd xmm, xmm, xmm
CPUID Flags: FMA
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
ENDFOR
dst[MAX:128] := 0
Performance
vfmsub132pd, vfmsub213pd, vfmsub231pd
__m128d _mm_mask_fmsub_pd (__m128d a, __mmask8 k, __m128d b, __m128d c)
Synopsis
__m128d _mm_mask_fmsub_pd (__m128d a, __mmask8 k, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfmsub132pd
vfmsub213pd
vfmsub231pd
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vfmsub132pd, vfmsub213pd, vfmsub231pd
__m128d _mm_mask3_fmsub_pd (__m128d a, __m128d b, __m128d c, __mmask8 k)
Synopsis
__m128d _mm_mask3_fmsub_pd (__m128d a, __m128d b, __m128d c, __mmask8 k)
#include "immintrin.h"
Instruction: vfmsub132pd
vfmsub213pd
vfmsub231pd
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
ELSE
dst[i+63:i] := c[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vfmsub132pd, vfmsub213pd, vfmsub231pd
__m128d _mm_maskz_fmsub_pd (__mmask8 k, __m128d a, __m128d b, __m128d c)
Synopsis
__m128d _mm_maskz_fmsub_pd (__mmask8 k, __m128d a, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfmsub132pd
vfmsub213pd
vfmsub231pd
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vfmsub132pd, vfmsub213pd, vfmsub231pd
__m256d _mm256_fmsub_pd (__m256d a, __m256d b, __m256d c)
Synopsis
__m256d _mm256_fmsub_pd (__m256d a, __m256d b, __m256d c)
#include "immintrin.h"
Instruction: vfmsub132pd ymm, ymm, ymm
vfmsub213pd ymm, ymm, ymm
vfmsub231pd ymm, ymm, ymm
CPUID Flags: FMA
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
ENDFOR
dst[MAX:256] := 0
Performance
vfmsub132pd, vfmsub213pd, vfmsub231pd
__m256d _mm256_mask_fmsub_pd (__m256d a, __mmask8 k, __m256d b, __m256d c)
Synopsis
__m256d _mm256_mask_fmsub_pd (__m256d a, __mmask8 k, __m256d b, __m256d c)
#include "immintrin.h"
Instruction: vfmsub132pd
vfmsub213pd
vfmsub231pd
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vfmsub132pd, vfmsub213pd, vfmsub231pd
__m256d _mm256_mask3_fmsub_pd (__m256d a, __m256d b, __m256d c, __mmask8 k)
Synopsis
__m256d _mm256_mask3_fmsub_pd (__m256d a, __m256d b, __m256d c, __mmask8 k)
#include "immintrin.h"
Instruction: vfmsub132pd
vfmsub213pd
vfmsub231pd
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
ELSE
dst[i+63:i] := c[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vfmsub132pd, vfmsub213pd, vfmsub231pd
__m256d _mm256_maskz_fmsub_pd (__mmask8 k, __m256d a, __m256d b, __m256d c)
Synopsis
__m256d _mm256_maskz_fmsub_pd (__mmask8 k, __m256d a, __m256d b, __m256d c)
#include "immintrin.h"
Instruction: vfmsub132pd
vfmsub213pd
vfmsub231pd
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vfmsub132pd, vfmsub213pd, vfmsub231pd
__m512d _mm512_fmsub_pd (__m512d a, __m512d b, __m512d c)
Synopsis
__m512d _mm512_fmsub_pd (__m512d a, __m512d b, __m512d c)
#include "immintrin.h"
Instruction: vfmsub132pd zmm {k}, zmm, zmm
vfmsub213pd zmm {k}, zmm, zmm
vfmsub231pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
ENDFOR
dst[MAX:512] := 0
vfmsub132pd, vfmsub213pd, vfmsub231pd
__m512d _mm512_mask_fmsub_pd (__m512d a, __mmask8 k, __m512d b, __m512d c)
Synopsis
__m512d _mm512_mask_fmsub_pd (__m512d a, __mmask8 k, __m512d b, __m512d c)
#include "immintrin.h"
Instruction: vfmsub132pd zmm {k}, zmm, zmm
vfmsub213pd zmm {k}, zmm, zmm
vfmsub231pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vfmsub132pd, vfmsub213pd, vfmsub231pd
__m512d _mm512_mask3_fmsub_pd (__m512d a, __m512d b, __m512d c, __mmask8 k)
Synopsis
__m512d _mm512_mask3_fmsub_pd (__m512d a, __m512d b, __m512d c, __mmask8 k)
#include "immintrin.h"
Instruction: vfmsub132pd zmm {k}, zmm, zmm
vfmsub213pd zmm {k}, zmm, zmm
vfmsub231pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
ELSE
dst[i+63:i] := c[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vfmsub132pd, vfmsub213pd, vfmsub231pd
__m512d _mm512_maskz_fmsub_pd (__mmask8 k, __m512d a, __m512d b, __m512d c)
Synopsis
__m512d _mm512_maskz_fmsub_pd (__mmask8 k, __m512d a, __m512d b, __m512d c)
#include "immintrin.h"
Instruction: vfmsub132pd zmm {k}, zmm, zmm
vfmsub213pd zmm {k}, zmm, zmm
vfmsub231pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vfmsub132ps, vfmsub213ps, vfmsub231ps
__m128 _mm_fmsub_ps (__m128 a, __m128 b, __m128 c)
Synopsis
__m128 _mm_fmsub_ps (__m128 a, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfmsub132ps xmm, xmm, xmm
vfmsub213ps xmm, xmm, xmm
vfmsub231ps xmm, xmm, xmm
CPUID Flags: FMA
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
ENDFOR
dst[MAX:128] := 0
Performance
vfmsub132ps, vfmsub213ps, vfmsub231ps
__m128 _mm_mask_fmsub_ps (__m128 a, __mmask8 k, __m128 b, __m128 c)
Synopsis
__m128 _mm_mask_fmsub_ps (__m128 a, __mmask8 k, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfmsub132ps
vfmsub213ps
vfmsub231ps
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
ELSE
dst[i+31:i] := a[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vfmsub132ps, vfmsub213ps, vfmsub231ps
__m128 _mm_mask3_fmsub_ps (__m128 a, __m128 b, __m128 c, __mmask8 k)
Synopsis
__m128 _mm_mask3_fmsub_ps (__m128 a, __m128 b, __m128 c, __mmask8 k)
#include "immintrin.h"
Instruction: vfmsub132ps
vfmsub213ps
vfmsub231ps
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
ELSE
dst[i+31:i] := c[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vfmsub132ps, vfmsub213ps, vfmsub231ps
__m128 _mm_maskz_fmsub_ps (__mmask8 k, __m128 a, __m128 b, __m128 c)
Synopsis
__m128 _mm_maskz_fmsub_ps (__mmask8 k, __m128 a, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfmsub132ps
vfmsub213ps
vfmsub231ps
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vfmsub132ps, vfmsub213ps, vfmsub231ps
__m256 _mm256_fmsub_ps (__m256 a, __m256 b, __m256 c)
Synopsis
__m256 _mm256_fmsub_ps (__m256 a, __m256 b, __m256 c)
#include "immintrin.h"
Instruction: vfmsub132ps ymm, ymm, ymm
vfmsub213ps ymm, ymm, ymm
vfmsub231ps ymm, ymm, ymm
CPUID Flags: FMA
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
ENDFOR
dst[MAX:256] := 0
Performance
vfmsub132ps, vfmsub213ps, vfmsub231ps
__m256 _mm256_mask_fmsub_ps (__m256 a, __mmask8 k, __m256 b, __m256 c)
Synopsis
__m256 _mm256_mask_fmsub_ps (__m256 a, __mmask8 k, __m256 b, __m256 c)
#include "immintrin.h"
Instruction: vfmsub132ps
vfmsub213ps
vfmsub231ps
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
ELSE
dst[i+31:i] := a[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vfmsub132ps, vfmsub213ps, vfmsub231ps
__m256 _mm256_mask3_fmsub_ps (__m256 a, __m256 b, __m256 c, __mmask8 k)
Synopsis
__m256 _mm256_mask3_fmsub_ps (__m256 a, __m256 b, __m256 c, __mmask8 k)
#include "immintrin.h"
Instruction: vfmsub132ps
vfmsub213ps
vfmsub231ps
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
ELSE
dst[i+31:i] := c[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vfmsub132ps, vfmsub213ps, vfmsub231ps
__m256 _mm256_maskz_fmsub_ps (__mmask8 k, __m256 a, __m256 b, __m256 c)
Synopsis
__m256 _mm256_maskz_fmsub_ps (__mmask8 k, __m256 a, __m256 b, __m256 c)
#include "immintrin.h"
Instruction: vfmsub132ps
vfmsub213ps
vfmsub231ps
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vfmsub132ps, vfmsub213ps, vfmsub231ps
__m512 _mm512_fmsub_ps (__m512 a, __m512 b, __m512 c)
Synopsis
__m512 _mm512_fmsub_ps (__m512 a, __m512 b, __m512 c)
#include "immintrin.h"
Instruction: vfmsub132ps zmm {k}, zmm, zmm
vfmsub213ps zmm {k}, zmm, zmm
vfmsub231ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
ENDFOR
dst[MAX:512] := 0
vfmsub132ps, vfmsub213ps, vfmsub231ps
__m512 _mm512_mask_fmsub_ps (__m512 a, __mmask16 k, __m512 b, __m512 c)
Synopsis
__m512 _mm512_mask_fmsub_ps (__m512 a, __mmask16 k, __m512 b, __m512 c)
#include "immintrin.h"
Instruction: vfmsub132ps zmm {k}, zmm, zmm
vfmsub213ps zmm {k}, zmm, zmm
vfmsub231ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
ELSE
dst[i+31:i] := a[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vfmsub132ps, vfmsub213ps, vfmsub231ps
__m512 _mm512_mask3_fmsub_ps (__m512 a, __m512 b, __m512 c, __mmask16 k)
Synopsis
__m512 _mm512_mask3_fmsub_ps (__m512 a, __m512 b, __m512 c, __mmask16 k)
#include "immintrin.h"
Instruction: vfmsub132ps zmm {k}, zmm, zmm
vfmsub213ps zmm {k}, zmm, zmm
vfmsub231ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
ELSE
dst[i+31:i] := c[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vfmsub132ps, vfmsub213ps, vfmsub231ps
__m512 _mm512_maskz_fmsub_ps (__mmask16 k, __m512 a, __m512 b, __m512 c)
Synopsis
__m512 _mm512_maskz_fmsub_ps (__mmask16 k, __m512 a, __m512 b, __m512 c)
#include "immintrin.h"
Instruction: vfmsub132ps zmm {k}, zmm, zmm
vfmsub213ps zmm {k}, zmm, zmm
vfmsub231ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vfmsub132pd, vfmsub213pd, vfmsub231pd
__m512d _mm512_fmsub_round_pd (__m512d a, __m512d b, __m512d c, int rounding)
Synopsis
__m512d _mm512_fmsub_round_pd (__m512d a, __m512d b, __m512d c, int rounding)
#include "immintrin.h"
Instruction: vfmsub132pd zmm {k}, zmm, zmm {er}
vfmsub213pd zmm {k}, zmm, zmm {er}
vfmsub231pd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
ENDFOR
dst[MAX:512] := 0
vfmsub132pd, vfmsub213pd, vfmsub231pd
__m512d _mm512_mask_fmsub_round_pd (__m512d a, __mmask8 k, __m512d b, __m512d c, int rounding)
Synopsis
__m512d _mm512_mask_fmsub_round_pd (__m512d a, __mmask8 k, __m512d b, __m512d c, int rounding)
#include "immintrin.h"
Instruction: vfmsub132pd zmm {k}, zmm, zmm {er}
vfmsub213pd zmm {k}, zmm, zmm {er}
vfmsub231pd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vfmsub132pd, vfmsub213pd, vfmsub231pd
__m512d _mm512_mask3_fmsub_round_pd (__m512d a, __m512d b, __m512d c, __mmask8 k, int rounding)
Synopsis
__m512d _mm512_mask3_fmsub_round_pd (__m512d a, __m512d b, __m512d c, __mmask8 k, int rounding)
#include "immintrin.h"
Instruction: vfmsub132pd zmm {k}, zmm, zmm {er}
vfmsub213pd zmm {k}, zmm, zmm {er}
vfmsub231pd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
ELSE
dst[i+63:i] := c[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vfmsub132pd, vfmsub213pd, vfmsub231pd
__m512d _mm512_maskz_fmsub_round_pd (__mmask8 k, __m512d a, __m512d b, __m512d c, const int rounding)
Synopsis
__m512d _mm512_maskz_fmsub_round_pd (__mmask8 k, __m512d a, __m512d b, __m512d c, const int rounding)
#include "immintrin.h"
Instruction: vfmsub132pd zmm {k}, zmm, zmm {er}
vfmsub213pd zmm {k}, zmm, zmm {er}
vfmsub231pd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vfmsub132ps, vfmsub213ps, vfmsub231ps
__m512 _mm512_fmsub_round_ps (__m512 a, __m512 b, __m512 c, int rounding)
Synopsis
__m512 _mm512_fmsub_round_ps (__m512 a, __m512 b, __m512 c, int rounding)
#include "immintrin.h"
Instruction: vfmsub132ps zmm {k}, zmm, zmm {er}
vfmsub213ps zmm {k}, zmm, zmm {er}
vfmsub231ps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
ENDFOR
dst[MAX:512] := 0
vfmsub132ps, vfmsub213ps, vfmsub231ps
__m512 _mm512_mask_fmsub_round_ps (__m512 a, __mmask16 k, __m512 b, __m512 c, int rounding)
Synopsis
__m512 _mm512_mask_fmsub_round_ps (__m512 a, __mmask16 k, __m512 b, __m512 c, int rounding)
#include "immintrin.h"
Instruction: vfmsub132ps zmm {k}, zmm, zmm {er}
vfmsub213ps zmm {k}, zmm, zmm {er}
vfmsub231ps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
ELSE
dst[i+31:i] := a[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vfmsub132ps, vfmsub213ps, vfmsub231ps
__m512 _mm512_mask3_fmsub_round_ps (__m512 a, __m512 b, __m512 c, __mmask16 k, int rounding)
Synopsis
__m512 _mm512_mask3_fmsub_round_ps (__m512 a, __m512 b, __m512 c, __mmask16 k, int rounding)
#include "immintrin.h"
Instruction: vfmsub132ps zmm {k}, zmm, zmm {er}
vfmsub213ps zmm {k}, zmm, zmm {er}
vfmsub231ps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
ELSE
dst[i+31:i] := c[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vfmsub132ps, vfmsub213ps, vfmsub231ps
__m512 _mm512_maskz_fmsub_round_ps (__mmask16 k, __m512 a, __m512 b, __m512 c, const int rounding)
Synopsis
__m512 _mm512_maskz_fmsub_round_ps (__mmask16 k, __m512 a, __m512 b, __m512 c, const int rounding)
#include "immintrin.h"
Instruction: vfmsub132ps zmm {k}, zmm, zmm {er}
vfmsub213ps zmm {k}, zmm, zmm {er}
vfmsub231ps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vfmsub132sd, vfmsub213sd, vfmsub231sd
__m128d _mm_mask_fmsub_round_sd (__m128d a, __mmask8 k, __m128d b, __m128d c, int rounding)
Synopsis
__m128d _mm_mask_fmsub_round_sd (__m128d a, __mmask8 k, __m128d b, __m128d c, int rounding)
#include "immintrin.h"
Instruction: vfmsub132sd xmm {k}, xmm, xmm {er}
vfmsub213sd xmm {k}, xmm, xmm {er}
vfmsub231sd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[63:0] := (a[63:0] * b[63:0]) - c[63:0]
ELSE
dst[63:0] := a[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vfmsub132sd, vfmsub213sd, vfmsub231sd
__m128d _mm_mask3_fmsub_round_sd (__m128d a, __m128d b, __m128d c, __mmask8 k, int rounding)
Synopsis
__m128d _mm_mask3_fmsub_round_sd (__m128d a, __m128d b, __m128d c, __mmask8 k, int rounding)
#include "immintrin.h"
Instruction: vfmsub132sd xmm {k}, xmm, xmm {er}
vfmsub213sd xmm {k}, xmm, xmm {er}
vfmsub231sd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[63:0] := (a[63:0] * b[63:0]) - c[63:0]
ELSE
dst[63:0] := c[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vfmsub132sd, vfmsub213sd, vfmsub231sd
__m128d _mm_maskz_fmsub_round_sd (__mmask8 k, __m128d a, __m128d b, __m128d c, int rounding)
Synopsis
__m128d _mm_maskz_fmsub_round_sd (__mmask8 k, __m128d a, __m128d b, __m128d c, int rounding)
#include "immintrin.h"
Instruction: vfmsub132sd xmm {k}, xmm, xmm {er}
vfmsub213sd xmm {k}, xmm, xmm {er}
vfmsub231sd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[63:0] := (a[63:0] * b[63:0]) - c[63:0]
ELSE
dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vfmsub132ss, vfmsub213ss, vfmsub231ss
__m128 _mm_mask_fmsub_round_ss (__m128 a, __mmask8 k, __m128 b, __m128 c, int rounding)
Synopsis
__m128 _mm_mask_fmsub_round_ss (__m128 a, __mmask8 k, __m128 b, __m128 c, int rounding)
#include "immintrin.h"
Instruction: vfmsub132ss xmm {k}, xmm, xmm {er}
vfmsub213ss xmm {k}, xmm, xmm {er}
vfmsub231ss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[31:0] := (a[31:0] * b[31:0]) - c[31:0]
ELSE
dst[31:0] := a[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vfmsub132ss, vfmsub213ss, vfmsub231ss
__m128 _mm_mask3_fmsub_round_ss (__m128 a, __m128 b, __m128 c, __mmask8 k, int rounding)
Synopsis
__m128 _mm_mask3_fmsub_round_ss (__m128 a, __m128 b, __m128 c, __mmask8 k, int rounding)
#include "immintrin.h"
Instruction: vfmsub132ss xmm {k}, xmm, xmm {er}
vfmsub213ss xmm {k}, xmm, xmm {er}
vfmsub231ss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[31:0] := (a[31:0] * b[31:0]) - c[31:0]
ELSE
dst[31:0] := c[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vfmsub132ss, vfmsub213ss, vfmsub231ss
__m128 _mm_maskz_fmsub_round_ss (__mmask8 k, __m128 a, __m128 b, __m128 c, int rounding)
Synopsis
__m128 _mm_maskz_fmsub_round_ss (__mmask8 k, __m128 a, __m128 b, __m128 c, int rounding)
#include "immintrin.h"
Instruction: vfmsub132ss xmm {k}, xmm, xmm {er}
vfmsub213ss xmm {k}, xmm, xmm {er}
vfmsub231ss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[31:0] := (a[31:0] * b[31:0]) - c[31:0]
ELSE
dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vfmsub132sd, vfmsub213sd, vfmsub231sd
__m128d _mm_fmsub_sd (__m128d a, __m128d b, __m128d c)
Synopsis
__m128d _mm_fmsub_sd (__m128d a, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfmsub132sd xmm, xmm, xmm
vfmsub213sd xmm, xmm, xmm
vfmsub231sd xmm, xmm, xmm
CPUID Flags: FMA
Description
Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
Operation
dst[63:0] := (a[63:0] * b[63:0]) - c[63:0]
dst[127:64] := a[127:64]
dst[MAX:128] := 0
Performance
vfmsub132sd, vfmsub213sd, vfmsub231sd
__m128d _mm_mask_fmsub_sd (__m128d a, __mmask8 k, __m128d b, __m128d c)
Synopsis
__m128d _mm_mask_fmsub_sd (__m128d a, __mmask8 k, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfmsub132sd xmm {k}, xmm, xmm
vfmsub213sd xmm {k}, xmm, xmm
vfmsub231sd xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Operation
IF k[0]
dst[63:0] := (a[63:0] * b[63:0]) - c[63:0]
ELSE
dst[63:0] := a[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vfmsub132sd, vfmsub213sd, vfmsub231sd
__m128d _mm_mask3_fmsub_sd (__m128d a, __m128d b, __m128d c, __mmask8 k)
Synopsis
__m128d _mm_mask3_fmsub_sd (__m128d a, __m128d b, __m128d c, __mmask8 k)
#include "immintrin.h"
Instruction: vfmsub132sd xmm {k}, xmm, xmm
vfmsub213sd xmm {k}, xmm, xmm
vfmsub231sd xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Operation
IF k[0]
dst[63:0] := (a[63:0] * b[63:0]) - c[63:0]
ELSE
dst[63:0] := c[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vfmsub132sd, vfmsub213sd, vfmsub231sd
__m128d _mm_maskz_fmsub_sd (__mmask8 k, __m128d a, __m128d b, __m128d c)
Synopsis
__m128d _mm_maskz_fmsub_sd (__mmask8 k, __m128d a, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfmsub132sd xmm {k}, xmm, xmm
vfmsub213sd xmm {k}, xmm, xmm
vfmsub231sd xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Operation
IF k[0]
dst[63:0] := (a[63:0] * b[63:0]) - c[63:0]
ELSE
dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vfmsub132ss, vfmsub213ss, vfmsub231ss
__m128 _mm_fmsub_ss (__m128 a, __m128 b, __m128 c)
Synopsis
__m128 _mm_fmsub_ss (__m128 a, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfmsub132ss xmm, xmm, xmm
vfmsub213ss xmm, xmm, xmm
vfmsub231ss xmm, xmm, xmm
CPUID Flags: FMA
Description
Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
dst[31:0] := (a[31:0] * b[31:0]) - c[31:0]
dst[127:32] := a[127:32]
dst[MAX:128] := 0
Performance
vfmsub132ss, vfmsub213ss, vfmsub231ss
__m128 _mm_mask_fmsub_ss (__m128 a, __mmask8 k, __m128 b, __m128 c)
Synopsis
__m128 _mm_mask_fmsub_ss (__m128 a, __mmask8 k, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfmsub132ss xmm {k}, xmm, xmm
vfmsub213ss xmm {k}, xmm, xmm
vfmsub231ss xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
IF k[0]
dst[31:0] := (a[31:0] * b[31:0]) - c[31:0]
ELSE
dst[31:0] := a[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vfmsub132ss, vfmsub213ss, vfmsub231ss
__m128 _mm_mask3_fmsub_ss (__m128 a, __m128 b, __m128 c, __mmask8 k)
Synopsis
__m128 _mm_mask3_fmsub_ss (__m128 a, __m128 b, __m128 c, __mmask8 k)
#include "immintrin.h"
Instruction: vfmsub132ss xmm {k}, xmm, xmm
vfmsub213ss xmm {k}, xmm, xmm
vfmsub231ss xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
IF k[0]
dst[31:0] := (a[31:0] * b[31:0]) - c[31:0]
ELSE
dst[31:0] := c[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vfmsub132ss, vfmsub213ss, vfmsub231ss
__m128 _mm_maskz_fmsub_ss (__mmask8 k, __m128 a, __m128 b, __m128 c)
Synopsis
__m128 _mm_maskz_fmsub_ss (__mmask8 k, __m128 a, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfmsub132ss xmm {k}, xmm, xmm
vfmsub213ss xmm {k}, xmm, xmm
vfmsub231ss xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
IF k[0]
dst[31:0] := (a[31:0] * b[31:0]) - c[31:0]
ELSE
dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vfmsubadd132pd, vfmsubadd213pd, vfmsubadd231pd
__m128d _mm_fmsubadd_pd (__m128d a, __m128d b, __m128d c)
Synopsis
__m128d _mm_fmsubadd_pd (__m128d a, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfmsubadd132pd xmm, xmm, xmm
vfmsubadd213pd xmm, xmm, xmm
vfmsubadd231pd xmm, xmm, xmm
CPUID Flags: FMA
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
IF (j is even)
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
ELSE
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
Performance
vfmsubadd132pd, vfmsubadd213pd, vfmsubadd231pd
__m128d _mm_mask_fmsubadd_pd (__m128d a, __mmask8 k, __m128d b, __m128d c)
Synopsis
__m128d _mm_mask_fmsubadd_pd (__m128d a, __mmask8 k, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfmsubadd132pd
vfmsubadd213pd
vfmsubadd231pd
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
IF (j is even)
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
ELSE
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
FI
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vfmsubadd132pd, vfmsubadd213pd, vfmsubadd231pd
__m128d _mm_mask3_fmsubadd_pd (__m128d a, __m128d b, __m128d c, __mmask8 k)
Synopsis
__m128d _mm_mask3_fmsubadd_pd (__m128d a, __m128d b, __m128d c, __mmask8 k)
#include "immintrin.h"
Instruction: vfmsubadd132pd
vfmsubadd213pd
vfmsubadd231pd
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
IF (j is even)
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
ELSE
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
FI
ELSE
dst[i+63:i] := c[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vfmsubadd132pd, vfmsubadd213pd, vfmsubadd231pd
__m128d _mm_maskz_fmsubadd_pd (__mmask8 k, __m128d a, __m128d b, __m128d c)
Synopsis
__m128d _mm_maskz_fmsubadd_pd (__mmask8 k, __m128d a, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfmsubadd132pd
vfmsubadd213pd
vfmsubadd231pd
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
IF (j is even)
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
ELSE
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
FI
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vfmsubadd132pd, vfmsubadd213pd, vfmsubadd231pd
__m256d _mm256_fmsubadd_pd (__m256d a, __m256d b, __m256d c)
Synopsis
__m256d _mm256_fmsubadd_pd (__m256d a, __m256d b, __m256d c)
#include "immintrin.h"
Instruction: vfmsubadd132pd ymm, ymm, ymm
vfmsubadd213pd ymm, ymm, ymm
vfmsubadd231pd ymm, ymm, ymm
CPUID Flags: FMA
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
IF (j is even)
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
ELSE
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
Performance
vfmsubadd132pd, vfmsubadd213pd, vfmsubadd231pd
__m256d _mm256_mask_fmsubadd_pd (__m256d a, __mmask8 k, __m256d b, __m256d c)
Synopsis
__m256d _mm256_mask_fmsubadd_pd (__m256d a, __mmask8 k, __m256d b, __m256d c)
#include "immintrin.h"
Instruction: vfmsubadd132pd
vfmsubadd213pd
vfmsubadd231pd
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
IF (j is even)
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
ELSE
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
FI
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vfmsubadd132pd, vfmsubadd213pd, vfmsubadd231pd
__m256d _mm256_mask3_fmsubadd_pd (__m256d a, __m256d b, __m256d c, __mmask8 k)
Synopsis
__m256d _mm256_mask3_fmsubadd_pd (__m256d a, __m256d b, __m256d c, __mmask8 k)
#include "immintrin.h"
Instruction: vfmsubadd132pd
vfmsubadd213pd
vfmsubadd231pd
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
IF (j is even)
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
ELSE
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
FI
ELSE
dst[i+63:i] := c[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vfmsubadd132pd, vfmsubadd213pd, vfmsubadd231pd
__m256d _mm256_maskz_fmsubadd_pd (__mmask8 k, __m256d a, __m256d b, __m256d c)
Synopsis
__m256d _mm256_maskz_fmsubadd_pd (__mmask8 k, __m256d a, __m256d b, __m256d c)
#include "immintrin.h"
Instruction: vfmsubadd132pd
vfmsubadd213pd
vfmsubadd231pd
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
IF (j is even)
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
ELSE
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
FI
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vfmsubadd132pd, vfmsubadd213pd, vfmsubadd231pd
__m512d _mm512_fmsubadd_pd (__m512d a, __m512d b, __m512d c)
Synopsis
__m512d _mm512_fmsubadd_pd (__m512d a, __m512d b, __m512d c)
#include "immintrin.h"
Instruction: vfmsubadd132pd zmm {k}, zmm, zmm
vfmsubadd213pd zmm {k}, zmm, zmm
vfmsubadd231pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
IF (j is even)
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
ELSE
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vfmsubadd132pd, vfmsubadd213pd, vfmsubadd231pd
__m512d _mm512_mask_fmsubadd_pd (__m512d a, __mmask8 k, __m512d b, __m512d c)
Synopsis
__m512d _mm512_mask_fmsubadd_pd (__m512d a, __mmask8 k, __m512d b, __m512d c)
#include "immintrin.h"
Instruction: vfmsubadd132pd zmm {k}, zmm, zmm
vfmsubadd213pd zmm {k}, zmm, zmm
vfmsubadd231pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
IF (j is even)
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
ELSE
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
FI
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vfmsubadd132pd, vfmsubadd213pd, vfmsubadd231pd
__m512d _mm512_mask3_fmsubadd_pd (__m512d a, __m512d b, __m512d c, __mmask8 k)
Synopsis
__m512d _mm512_mask3_fmsubadd_pd (__m512d a, __m512d b, __m512d c, __mmask8 k)
#include "immintrin.h"
Instruction: vfmsubadd132pd zmm {k}, zmm, zmm
vfmsubadd213pd zmm {k}, zmm, zmm
vfmsubadd231pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
IF (j is even)
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
ELSE
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
FI
ELSE
dst[i+63:i] := c[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vfmsubadd132pd, vfmsubadd213pd, vfmsubadd231pd
__m512d _mm512_maskz_fmsubadd_pd (__mmask8 k, __m512d a, __m512d b, __m512d c)
Synopsis
__m512d _mm512_maskz_fmsubadd_pd (__mmask8 k, __m512d a, __m512d b, __m512d c)
#include "immintrin.h"
Instruction: vfmsubadd132pd zmm {k}, zmm, zmm
vfmsubadd213pd zmm {k}, zmm, zmm
vfmsubadd231pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
IF (j is even)
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
ELSE
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
FI
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vfmsubadd132ps, vfmsubadd213ps, vfmsubadd231ps
__m128 _mm_fmsubadd_ps (__m128 a, __m128 b, __m128 c)
Synopsis
__m128 _mm_fmsubadd_ps (__m128 a, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfmsubadd132ps xmm, xmm, xmm
vfmsubadd213ps xmm, xmm, xmm
vfmsubadd231ps xmm, xmm, xmm
CPUID Flags: FMA
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
IF (j is even)
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
ELSE
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
Performance
vfmsubadd132ps, vfmsubadd213ps, vfmsubadd231ps
__m128 _mm_mask_fmsubadd_ps (__m128 a, __mmask8 k, __m128 b, __m128 c)
Synopsis
__m128 _mm_mask_fmsubadd_ps (__m128 a, __mmask8 k, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfmsubadd132ps
vfmsubadd213ps
vfmsubadd231ps
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
IF (j is even)
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
ELSE
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
FI
ELSE
dst[i+31:i] := a[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vfmsubadd132ps, vfmsubadd213ps, vfmsubadd231ps
__m128 _mm_mask3_fmsubadd_ps (__m128 a, __m128 b, __m128 c, __mmask8 k)
Synopsis
__m128 _mm_mask3_fmsubadd_ps (__m128 a, __m128 b, __m128 c, __mmask8 k)
#include "immintrin.h"
Instruction: vfmsubadd132ps
vfmsubadd213ps
vfmsubadd231ps
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
IF (j is even)
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
ELSE
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
FI
ELSE
dst[i+31:i] := c[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vfmsubadd132ps, vfmsubadd213ps, vfmsubadd231ps
__m128 _mm_maskz_fmsubadd_ps (__mmask8 k, __m128 a, __m128 b, __m128 c)
Synopsis
__m128 _mm_maskz_fmsubadd_ps (__mmask8 k, __m128 a, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfmsubadd132ps
vfmsubadd213ps
vfmsubadd231ps
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
IF (j is even)
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
ELSE
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
FI
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vfmsubadd132ps, vfmsubadd213ps, vfmsubadd231ps
__m256 _mm256_fmsubadd_ps (__m256 a, __m256 b, __m256 c)
Synopsis
__m256 _mm256_fmsubadd_ps (__m256 a, __m256 b, __m256 c)
#include "immintrin.h"
Instruction: vfmsubadd132ps ymm, ymm, ymm
vfmsubadd213ps ymm, ymm, ymm
vfmsubadd231ps ymm, ymm, ymm
CPUID Flags: FMA
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
IF (j is even)
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
ELSE
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
Performance
vfmsubadd132ps, vfmsubadd213ps, vfmsubadd231ps
__m256 _mm256_mask_fmsubadd_ps (__m256 a, __mmask8 k, __m256 b, __m256 c)
Synopsis
__m256 _mm256_mask_fmsubadd_ps (__m256 a, __mmask8 k, __m256 b, __m256 c)
#include "immintrin.h"
Instruction: vfmsubadd132ps
vfmsubadd213ps
vfmsubadd231ps
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
IF (j is even)
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
ELSE
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
FI
ELSE
dst[i+31:i] := a[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vfmsubadd132ps, vfmsubadd213ps, vfmsubadd231ps
__m256 _mm256_mask3_fmsubadd_ps (__m256 a, __m256 b, __m256 c, __mmask8 k)
Synopsis
__m256 _mm256_mask3_fmsubadd_ps (__m256 a, __m256 b, __m256 c, __mmask8 k)
#include "immintrin.h"
Instruction: vfmsubadd132ps
vfmsubadd213ps
vfmsubadd231ps
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
IF (j is even)
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
ELSE
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
FI
ELSE
dst[i+31:i] := c[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vfmsubadd132ps, vfmsubadd213ps, vfmsubadd231ps
__m256 _mm256_maskz_fmsubadd_ps (__mmask8 k, __m256 a, __m256 b, __m256 c)
Synopsis
__m256 _mm256_maskz_fmsubadd_ps (__mmask8 k, __m256 a, __m256 b, __m256 c)
#include "immintrin.h"
Instruction: vfmsubadd132ps
vfmsubadd213ps
vfmsubadd231ps
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
IF (j is even)
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
ELSE
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
FI
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vfmsubadd132ps, vfmsubadd213ps, vfmsubadd231ps
__m512 _mm512_fmsubadd_ps (__m512 a, __m512 b, __m512 c)
Synopsis
__m512 _mm512_fmsubadd_ps (__m512 a, __m512 b, __m512 c)
#include "immintrin.h"
Instruction: vfmsubadd132ps zmm {k}, zmm, zmm
vfmsubadd213ps zmm {k}, zmm, zmm
vfmsubadd231ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
IF (j is even)
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
ELSE
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vfmsubadd132ps, vfmsubadd213ps, vfmsubadd231ps
__m512 _mm512_mask_fmsubadd_ps (__m512 a, __mmask16 k, __m512 b, __m512 c)
Synopsis
__m512 _mm512_mask_fmsubadd_ps (__m512 a, __mmask16 k, __m512 b, __m512 c)
#include "immintrin.h"
Instruction: vfmsubadd132ps zmm {k}, zmm, zmm
vfmsubadd213ps zmm {k}, zmm, zmm
vfmsubadd231ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
IF (j is even)
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
ELSE
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
FI
ELSE
dst[i+31:i] := a[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vfmsubadd132ps, vfmsubadd213ps, vfmsubadd231ps
__m512 _mm512_mask3_fmsubadd_ps (__m512 a, __m512 b, __m512 c, __mmask16 k)
Synopsis
__m512 _mm512_mask3_fmsubadd_ps (__m512 a, __m512 b, __m512 c, __mmask16 k)
#include "immintrin.h"
Instruction: vfmsubadd132ps zmm {k}, zmm, zmm
vfmsubadd213ps zmm {k}, zmm, zmm
vfmsubadd231ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
IF (j is even)
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
ELSE
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
FI
ELSE
dst[i+31:i] := c[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vfmsubadd132ps, vfmsubadd213ps, vfmsubadd231ps
__m512 _mm512_maskz_fmsubadd_ps (__mmask16 k, __m512 a, __m512 b, __m512 c)
Synopsis
__m512 _mm512_maskz_fmsubadd_ps (__mmask16 k, __m512 a, __m512 b, __m512 c)
#include "immintrin.h"
Instruction: vfmsubadd132ps zmm {k}, zmm, zmm
vfmsubadd213ps zmm {k}, zmm, zmm
vfmsubadd231ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
IF (j is even)
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
ELSE
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
FI
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vfmsubadd132pd, vfmsubadd213pd, vfmsubadd231pd
__m512d _mm512_fmsubadd_round_pd (__m512d a, __m512d b, __m512d c, const int rounding)
Synopsis
__m512d _mm512_fmsubadd_round_pd (__m512d a, __m512d b, __m512d c, const int rounding)
#include "immintrin.h"
Instruction: vfmsubadd132pd zmm {k}, zmm, zmm {er}
vfmsubadd213pd zmm {k}, zmm, zmm {er}
vfmsubadd231pd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
IF (j is even)
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
ELSE
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vfmsubadd132pd, vfmsubadd213pd, vfmsubadd231pd
__m512d _mm512_mask_fmsubadd_round_pd (__m512d a, __mmask8 k, __m512d b, __m512d c, const int rounding)
Synopsis
__m512d _mm512_mask_fmsubadd_round_pd (__m512d a, __mmask8 k, __m512d b, __m512d c, const int rounding)
#include "immintrin.h"
Instruction: vfmsubadd132pd zmm {k}, zmm, zmm {er}
vfmsubadd213pd zmm {k}, zmm, zmm {er}
vfmsubadd231pd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
IF (j is even)
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
ELSE
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
FI
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vfmsubadd132pd, vfmsubadd213pd, vfmsubadd231pd
__m512d _mm512_mask3_fmsubadd_round_pd (__m512d a, __m512d b, __m512d c, __mmask8 k, const int rounding)
Synopsis
__m512d _mm512_mask3_fmsubadd_round_pd (__m512d a, __m512d b, __m512d c, __mmask8 k, const int rounding)
#include "immintrin.h"
Instruction: vfmsubadd132pd zmm {k}, zmm, zmm {er}
vfmsubadd213pd zmm {k}, zmm, zmm {er}
vfmsubadd231pd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
IF (j is even)
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
ELSE
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
FI
ELSE
dst[i+63:i] := c[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vfmsubadd132pd, vfmsubadd213pd, vfmsubadd231pd
__m512d _mm512_maskz_fmsubadd_round_pd (__mmask8 k, __m512d a, __m512d b, __m512d c, const int rounding)
Synopsis
__m512d _mm512_maskz_fmsubadd_round_pd (__mmask8 k, __m512d a, __m512d b, __m512d c, const int rounding)
#include "immintrin.h"
Instruction: vfmsubadd132pd zmm {k}, zmm, zmm {er}
vfmsubadd213pd zmm {k}, zmm, zmm {er}
vfmsubadd231pd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
IF (j is even)
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) + c[i+63:i]
ELSE
dst[i+63:i] := (a[i+63:i] * b[i+63:i]) - c[i+63:i]
FI
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vfmsubadd132ps, vfmsubadd213ps, vfmsubadd231ps
__m512 _mm512_fmsubadd_round_ps (__m512 a, __m512 b, __m512 c, const int rounding)
Synopsis
__m512 _mm512_fmsubadd_round_ps (__m512 a, __m512 b, __m512 c, const int rounding)
#include "immintrin.h"
Instruction: vfmsubadd132ps zmm {k}, zmm, zmm {er}
vfmsubadd213ps zmm {k}, zmm, zmm {er}
vfmsubadd231ps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
IF (j is even)
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
ELSE
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vfmsubadd132ps, vfmsubadd213ps, vfmsubadd231ps
__m512 _mm512_mask_fmsubadd_round_ps (__m512 a, __mmask16 k, __m512 b, __m512 c, const int rounding)
Synopsis
__m512 _mm512_mask_fmsubadd_round_ps (__m512 a, __mmask16 k, __m512 b, __m512 c, const int rounding)
#include "immintrin.h"
Instruction: vfmsubadd132ps zmm {k}, zmm, zmm {er}
vfmsubadd213ps zmm {k}, zmm, zmm {er}
vfmsubadd231ps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
IF (j is even)
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
ELSE
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
FI
ELSE
dst[i+31:i] := a[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vfmsubadd132ps, vfmsubadd213ps, vfmsubadd231ps
__m512 _mm512_mask3_fmsubadd_round_ps (__m512 a, __m512 b, __m512 c, __mmask16 k, const int rounding)
Synopsis
__m512 _mm512_mask3_fmsubadd_round_ps (__m512 a, __m512 b, __m512 c, __mmask16 k, const int rounding)
#include "immintrin.h"
Instruction: vfmsubadd132ps zmm {k}, zmm, zmm {er}
vfmsubadd213ps zmm {k}, zmm, zmm {er}
vfmsubadd231ps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
IF (j is even)
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
ELSE
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
FI
ELSE
dst[i+31:i] := c[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vfmsubadd132ps, vfmsubadd213ps, vfmsubadd231ps
__m512 _mm512_maskz_fmsubadd_round_ps (__mmask16 k, __m512 a, __m512 b, __m512 c, const int rounding)
Synopsis
__m512 _mm512_maskz_fmsubadd_round_ps (__mmask16 k, __m512 a, __m512 b, __m512 c, const int rounding)
#include "immintrin.h"
Instruction: vfmsubadd132ps zmm {k}, zmm, zmm {er}
vfmsubadd213ps zmm {k}, zmm, zmm {er}
vfmsubadd231ps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
IF (j is even)
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) + c[i+31:i]
ELSE
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) - c[i+31:i]
FI
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vfnmadd132pd, vfnmadd213pd, vfnmadd231pd
__m128d _mm_fnmadd_pd (__m128d a, __m128d b, __m128d c)
Synopsis
__m128d _mm_fnmadd_pd (__m128d a, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfnmadd132pd xmm, xmm, xmm
vfnmadd213pd xmm, xmm, xmm
vfnmadd231pd xmm, xmm, xmm
CPUID Flags: FMA
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
ENDFOR
dst[MAX:128] := 0
Performance
vfnmadd132pd, vfnmadd213pd, vfnmadd231pd
__m128d _mm_mask_fnmadd_pd (__m128d a, __mmask8 k, __m128d b, __m128d c)
Synopsis
__m128d _mm_mask_fnmadd_pd (__m128d a, __mmask8 k, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfnmadd132pd
vfnmadd213pd
vfnmadd231pd
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vfnmadd132pd, vfnmadd213pd, vfnmadd231pd
__m128d _mm_mask3_fnmadd_pd (__m128d a, __m128d b, __m128d c, __mmask8 k)
Synopsis
__m128d _mm_mask3_fnmadd_pd (__m128d a, __m128d b, __m128d c, __mmask8 k)
#include "immintrin.h"
Instruction: vfnmadd132pd
vfnmadd213pd
vfnmadd231pd
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
ELSE
dst[i+63:i] := c[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vfnmadd132pd, vfnmadd213pd, vfnmadd231pd
__m128d _mm_maskz_fnmadd_pd (__mmask8 k, __m128d a, __m128d b, __m128d c)
Synopsis
__m128d _mm_maskz_fnmadd_pd (__mmask8 k, __m128d a, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfnmadd132pd
vfnmadd213pd
vfnmadd231pd
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vfnmadd132pd, vfnmadd213pd, vfnmadd231pd
__m256d _mm256_fnmadd_pd (__m256d a, __m256d b, __m256d c)
Synopsis
__m256d _mm256_fnmadd_pd (__m256d a, __m256d b, __m256d c)
#include "immintrin.h"
Instruction: vfnmadd132pd ymm, ymm, ymm
vfnmadd213pd ymm, ymm, ymm
vfnmadd231pd ymm, ymm, ymm
CPUID Flags: FMA
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
ENDFOR
dst[MAX:256] := 0
Performance
vfnmadd132pd, vfnmadd213pd, vfnmadd231pd
__m256d _mm256_mask_fnmadd_pd (__m256d a, __mmask8 k, __m256d b, __m256d c)
Synopsis
__m256d _mm256_mask_fnmadd_pd (__m256d a, __mmask8 k, __m256d b, __m256d c)
#include "immintrin.h"
Instruction: vfnmadd132pd
vfnmadd213pd
vfnmadd231pd
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vfnmadd132pd, vfnmadd213pd, vfnmadd231pd
__m256d _mm256_mask3_fnmadd_pd (__m256d a, __m256d b, __m256d c, __mmask8 k)
Synopsis
__m256d _mm256_mask3_fnmadd_pd (__m256d a, __m256d b, __m256d c, __mmask8 k)
#include "immintrin.h"
Instruction: vfnmadd132pd
vfnmadd213pd
vfnmadd231pd
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
ELSE
dst[i+63:i] := c[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vfnmadd132pd, vfnmadd213pd, vfnmadd231pd
__m256d _mm256_maskz_fnmadd_pd (__mmask8 k, __m256d a, __m256d b, __m256d c)
Synopsis
__m256d _mm256_maskz_fnmadd_pd (__mmask8 k, __m256d a, __m256d b, __m256d c)
#include "immintrin.h"
Instruction: vfnmadd132pd
vfnmadd213pd
vfnmadd231pd
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vfnmadd132pd, vfnmadd213pd, vfnmadd231pd
__m512d _mm512_fnmadd_pd (__m512d a, __m512d b, __m512d c)
Synopsis
__m512d _mm512_fnmadd_pd (__m512d a, __m512d b, __m512d c)
#include "immintrin.h"
Instruction: vfnmadd132pd zmm {k}, zmm, zmm
vfnmadd213pd zmm {k}, zmm, zmm
vfnmadd231pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
ENDFOR
dst[MAX:512] := 0
vfnmadd132pd, vfnmadd213pd, vfnmadd231pd
__m512d _mm512_mask_fnmadd_pd (__m512d a, __mmask8 k, __m512d b, __m512d c)
Synopsis
__m512d _mm512_mask_fnmadd_pd (__m512d a, __mmask8 k, __m512d b, __m512d c)
#include "immintrin.h"
Instruction: vfnmadd132pd zmm {k}, zmm, zmm
vfnmadd213pd zmm {k}, zmm, zmm
vfnmadd231pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vfnmadd132pd, vfnmadd213pd, vfnmadd231pd
__m512d _mm512_mask3_fnmadd_pd (__m512d a, __m512d b, __m512d c, __mmask8 k)
Synopsis
__m512d _mm512_mask3_fnmadd_pd (__m512d a, __m512d b, __m512d c, __mmask8 k)
#include "immintrin.h"
Instruction: vfnmadd132pd zmm {k}, zmm, zmm
vfnmadd213pd zmm {k}, zmm, zmm
vfnmadd231pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
ELSE
dst[i+63:i] := c[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vfnmadd132pd, vfnmadd213pd, vfnmadd231pd
__m512d _mm512_maskz_fnmadd_pd (__mmask8 k, __m512d a, __m512d b, __m512d c)
Synopsis
__m512d _mm512_maskz_fnmadd_pd (__mmask8 k, __m512d a, __m512d b, __m512d c)
#include "immintrin.h"
Instruction: vfnmadd132pd zmm {k}, zmm, zmm
vfnmadd213pd zmm {k}, zmm, zmm
vfnmadd231pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vfnmadd132ps, vfnmadd213ps, vfnmadd231ps
__m128 _mm_fnmadd_ps (__m128 a, __m128 b, __m128 c)
Synopsis
__m128 _mm_fnmadd_ps (__m128 a, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfnmadd132ps xmm, xmm, xmm
vfnmadd213ps xmm, xmm, xmm
vfnmadd231ps xmm, xmm, xmm
CPUID Flags: FMA
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
ENDFOR
dst[MAX:128] := 0
Performance
vfnmadd132ps, vfnmadd213ps, vfnmadd231ps
__m128 _mm_mask_fnmadd_ps (__m128 a, __mmask8 k, __m128 b, __m128 c)
Synopsis
__m128 _mm_mask_fnmadd_ps (__m128 a, __mmask8 k, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfnmadd132ps
vfnmadd213ps
vfnmadd231ps
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
ELSE
dst[i+31:i] := a[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vfnmadd132ps, vfnmadd213ps, vfnmadd231ps
__m128 _mm_mask3_fnmadd_ps (__m128 a, __m128 b, __m128 c, __mmask8 k)
Synopsis
__m128 _mm_mask3_fnmadd_ps (__m128 a, __m128 b, __m128 c, __mmask8 k)
#include "immintrin.h"
Instruction: vfnmadd132ps
vfnmadd213ps
vfnmadd231ps
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
ELSE
dst[i+31:i] := c[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vfnmadd132ps, vfnmadd213ps, vfnmadd231ps
__m128 _mm_maskz_fnmadd_ps (__mmask8 k, __m128 a, __m128 b, __m128 c)
Synopsis
__m128 _mm_maskz_fnmadd_ps (__mmask8 k, __m128 a, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfnmadd132ps
vfnmadd213ps
vfnmadd231ps
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vfnmadd132ps, vfnmadd213ps, vfnmadd231ps
__m256 _mm256_fnmadd_ps (__m256 a, __m256 b, __m256 c)
Synopsis
__m256 _mm256_fnmadd_ps (__m256 a, __m256 b, __m256 c)
#include "immintrin.h"
Instruction: vfnmadd132ps ymm, ymm, ymm
vfnmadd213ps ymm, ymm, ymm
vfnmadd231ps ymm, ymm, ymm
CPUID Flags: FMA
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
ENDFOR
dst[MAX:256] := 0
Performance
vfnmadd132ps, vfnmadd213ps, vfnmadd231ps
__m256 _mm256_mask_fnmadd_ps (__m256 a, __mmask8 k, __m256 b, __m256 c)
Synopsis
__m256 _mm256_mask_fnmadd_ps (__m256 a, __mmask8 k, __m256 b, __m256 c)
#include "immintrin.h"
Instruction: vfnmadd132ps
vfnmadd213ps
vfnmadd231ps
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
ELSE
dst[i+31:i] := a[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vfnmadd132ps, vfnmadd213ps, vfnmadd231ps
__m256 _mm256_mask3_fnmadd_ps (__m256 a, __m256 b, __m256 c, __mmask8 k)
Synopsis
__m256 _mm256_mask3_fnmadd_ps (__m256 a, __m256 b, __m256 c, __mmask8 k)
#include "immintrin.h"
Instruction: vfnmadd132ps
vfnmadd213ps
vfnmadd231ps
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
ELSE
dst[i+31:i] := c[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vfnmadd132ps, vfnmadd213ps, vfnmadd231ps
__m256 _mm256_maskz_fnmadd_ps (__mmask8 k, __m256 a, __m256 b, __m256 c)
Synopsis
__m256 _mm256_maskz_fnmadd_ps (__mmask8 k, __m256 a, __m256 b, __m256 c)
#include "immintrin.h"
Instruction: vfnmadd132ps
vfnmadd213ps
vfnmadd231ps
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vfnmadd132ps, vfnmadd213ps, vfnmadd231ps
__m512 _mm512_fnmadd_ps (__m512 a, __m512 b, __m512 c)
Synopsis
__m512 _mm512_fnmadd_ps (__m512 a, __m512 b, __m512 c)
#include "immintrin.h"
Instruction: vfnmadd132ps zmm {k}, zmm, zmm
vfnmadd213ps zmm {k}, zmm, zmm
vfnmadd231ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
ENDFOR
dst[MAX:512] := 0
vfnmadd132ps, vfnmadd213ps, vfnmadd231ps
__m512 _mm512_mask_fnmadd_ps (__m512 a, __mmask16 k, __m512 b, __m512 c)
Synopsis
__m512 _mm512_mask_fnmadd_ps (__m512 a, __mmask16 k, __m512 b, __m512 c)
#include "immintrin.h"
Instruction: vfnmadd132ps zmm {k}, zmm, zmm
vfnmadd213ps zmm {k}, zmm, zmm
vfnmadd231ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
ELSE
dst[i+31:i] := a[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vfnmadd132ps, vfnmadd213ps, vfnmadd231ps
__m512 _mm512_mask3_fnmadd_ps (__m512 a, __m512 b, __m512 c, __mmask16 k)
Synopsis
__m512 _mm512_mask3_fnmadd_ps (__m512 a, __m512 b, __m512 c, __mmask16 k)
#include "immintrin.h"
Instruction: vfnmadd132ps zmm {k}, zmm, zmm
vfnmadd213ps zmm {k}, zmm, zmm
vfnmadd231ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
ELSE
dst[i+31:i] := c[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vfnmadd132ps, vfnmadd213ps, vfnmadd231ps
__m512 _mm512_maskz_fnmadd_ps (__mmask16 k, __m512 a, __m512 b, __m512 c)
Synopsis
__m512 _mm512_maskz_fnmadd_ps (__mmask16 k, __m512 a, __m512 b, __m512 c)
#include "immintrin.h"
Instruction: vfnmadd132ps zmm {k}, zmm, zmm
vfnmadd213ps zmm {k}, zmm, zmm
vfnmadd231ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vfnmadd132pd, vfnmadd213pd, vfnmadd231pd
__m512d _mm512_fnmadd_round_pd (__m512d a, __m512d b, __m512d c, int rounding)
Synopsis
__m512d _mm512_fnmadd_round_pd (__m512d a, __m512d b, __m512d c, int rounding)
#include "immintrin.h"
Instruction: vfnmadd132pd zmm {k}, zmm, zmm {er}
vfnmadd213pd zmm {k}, zmm, zmm {er}
vfnmadd231pd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed double-precision (64-bit) floating-point elements in
a and
b, add the negated intermediate result to packed elements in
c, and store the results in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
ENDFOR
dst[MAX:512] := 0
vfnmadd132pd, vfnmadd213pd, vfnmadd231pd
__m512d _mm512_mask_fnmadd_round_pd (__m512d a, __mmask8 k, __m512d b, __m512d c, int rounding)
Synopsis
__m512d _mm512_mask_fnmadd_round_pd (__m512d a, __mmask8 k, __m512d b, __m512d c, int rounding)
#include "immintrin.h"
Instruction: vfnmadd132pd zmm {k}, zmm, zmm {er}
vfnmadd213pd zmm {k}, zmm, zmm {er}
vfnmadd231pd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed double-precision (64-bit) floating-point elements in
a and
b, add the negated intermediate result to packed elements in
c, and store the results in
dst using writemask
k (elements are copied from
a when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vfnmadd132pd, vfnmadd213pd, vfnmadd231pd
__m512d _mm512_mask3_fnmadd_round_pd (__m512d a, __m512d b, __m512d c, __mmask8 k, int rounding)
Synopsis
__m512d _mm512_mask3_fnmadd_round_pd (__m512d a, __m512d b, __m512d c, __mmask8 k, int rounding)
#include "immintrin.h"
Instruction: vfnmadd132pd zmm {k}, zmm, zmm {er}
vfnmadd213pd zmm {k}, zmm, zmm {er}
vfnmadd231pd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed double-precision (64-bit) floating-point elements in
a and
b, add the negated intermediate result to packed elements in
c, and store the results in
dst using writemask
k (elements are copied from
c when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
ELSE
dst[i+63:i] := c[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vfnmadd132pd, vfnmadd213pd, vfnmadd231pd
__m512d _mm512_maskz_fnmadd_round_pd (__mmask8 k, __m512d a, __m512d b, __m512d c, const int rounding)
Synopsis
__m512d _mm512_maskz_fnmadd_round_pd (__mmask8 k, __m512d a, __m512d b, __m512d c, const int rounding)
#include "immintrin.h"
Instruction: vfnmadd132pd zmm {k}, zmm, zmm {er}
vfnmadd213pd zmm {k}, zmm, zmm {er}
vfnmadd231pd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in
a and
b, add the negated intermediate result to packed elements in
c, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) + c[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vfnmadd132ps, vfnmadd213ps, vfnmadd231ps
__m512 _mm512_fnmadd_round_ps (__m512 a, __m512 b, __m512 c, int rounding)
Synopsis
__m512 _mm512_fnmadd_round_ps (__m512 a, __m512 b, __m512 c, int rounding)
#include "immintrin.h"
Instruction: vfnmadd132ps zmm {k}, zmm, zmm {er}
vfnmadd213ps zmm {k}, zmm, zmm {er}
vfnmadd231ps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed single-precision (32-bit) floating-point elements in
a and
b, add the negated intermediate result to packed elements in
c, and store the results in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
ENDFOR
dst[MAX:512] := 0
vfnmadd132ps, vfnmadd213ps, vfnmadd231ps
__m512 _mm512_mask_fnmadd_round_ps (__m512 a, __mmask16 k, __m512 b, __m512 c, int rounding)
Synopsis
__m512 _mm512_mask_fnmadd_round_ps (__m512 a, __mmask16 k, __m512 b, __m512 c, int rounding)
#include "immintrin.h"
Instruction: vfnmadd132ps zmm {k}, zmm, zmm {er}
vfnmadd213ps zmm {k}, zmm, zmm {er}
vfnmadd231ps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed single-precision (32-bit) floating-point elements in
a and
b, add the negated intermediate result to packed elements in
c, and store the results in
dst using writemask
k (elements are copied from
a when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
ELSE
dst[i+31:i] := a[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vfnmadd132ps, vfnmadd213ps, vfnmadd231ps
__m512 _mm512_mask3_fnmadd_round_ps (__m512 a, __m512 b, __m512 c, __mmask16 k, int rounding)
Synopsis
__m512 _mm512_mask3_fnmadd_round_ps (__m512 a, __m512 b, __m512 c, __mmask16 k, int rounding)
#include "immintrin.h"
Instruction: vfnmadd132ps zmm {k}, zmm, zmm {er}
vfnmadd213ps zmm {k}, zmm, zmm {er}
vfnmadd231ps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed single-precision (32-bit) floating-point elements in
a and
b, add the negated intermediate result to packed elements in
c, and store the results in
dst using writemask
k (elements are copied from
c when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
ELSE
dst[i+31:i] := c[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vfnmadd132ps, vfnmadd213ps, vfnmadd231ps
__m512 _mm512_maskz_fnmadd_round_ps (__mmask16 k, __m512 a, __m512 b, __m512 c, const int rounding)
Synopsis
__m512 _mm512_maskz_fnmadd_round_ps (__mmask16 k, __m512 a, __m512 b, __m512 c, const int rounding)
#include "immintrin.h"
Instruction: vfnmadd132ps zmm {k}, zmm, zmm {er}
vfnmadd213ps zmm {k}, zmm, zmm {er}
vfnmadd231ps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in
a and
b, add the negated intermediate result to packed elements in
c, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) + c[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vfnmadd132sd, vfnmadd213sd, vfnmadd231sd
__m128d _mm_mask_fnmadd_round_sd (__m128d a, __mmask8 k, __m128d b, __m128d c, int rounding)
Synopsis
__m128d _mm_mask_fnmadd_round_sd (__m128d a, __mmask8 k, __m128d b, __m128d c, int rounding)
#include "immintrin.h"
Instruction: vfnmadd132sd xmm {k}, xmm, xmm {er}
vfnmadd213sd xmm {k}, xmm, xmm {er}
vfnmadd231sd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Multiply the lower double-precision (64-bit) floating-point elements in
a and
b, and add the negated intermediate result to the lower element in
c. Store the result in the lower element of
dst using writemask
k (the element is copied from
a when mask bit 0 is not set), and copy the upper element from
a to the upper element of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0]
ELSE
dst[63:0] := a[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vfnmadd132sd, vfnmadd213sd, vfnmadd231sd
__m128d _mm_mask3_fnmadd_round_sd (__m128d a, __m128d b, __m128d c, __mmask8 k, int rounding)
Synopsis
__m128d _mm_mask3_fnmadd_round_sd (__m128d a, __m128d b, __m128d c, __mmask8 k, int rounding)
#include "immintrin.h"
Instruction: vfnmadd132sd xmm {k}, xmm, xmm {er}
vfnmadd213sd xmm {k}, xmm, xmm {er}
vfnmadd231sd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Multiply the lower double-precision (64-bit) floating-point elements in
a and
b, and add the negated intermediate result to the lower element in
c. Store the result in the lower element of
dst using writemask
k (the element is copied from
c when mask bit 0 is not set), and copy the upper element from
a to the upper element of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0]
ELSE
dst[63:0] := c[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vfnmadd132sd, vfnmadd213sd, vfnmadd231sd
__m128d _mm_maskz_fnmadd_round_sd (__mmask8 k, __m128d a, __m128d b, __m128d c, int rounding)
Synopsis
__m128d _mm_maskz_fnmadd_round_sd (__mmask8 k, __m128d a, __m128d b, __m128d c, int rounding)
#include "immintrin.h"
Instruction: vfnmadd132sd xmm {k}, xmm, xmm {er}
vfnmadd213sd xmm {k}, xmm, xmm {er}
vfnmadd231sd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Multiply the lower double-precision (64-bit) floating-point elements in
a and
b, and add the negated intermediate result to the lower element in
c. Store the result in the lower element of
dst using zeromask
k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from
a to the upper element of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0]
ELSE
dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vfnmadd132ss, vfnmadd213ss, vfnmadd231ss
__m128 _mm_mask_fnmadd_round_ss (__m128 a, __mmask8 k, __m128 b, __m128 c, int rounding)
Synopsis
__m128 _mm_mask_fnmadd_round_ss (__m128 a, __mmask8 k, __m128 b, __m128 c, int rounding)
#include "immintrin.h"
Instruction: vfnmadd132ss xmm {k}, xmm, xmm {er}
vfnmadd213ss xmm {k}, xmm, xmm {er}
vfnmadd231ss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Multiply the lower single-precision (32-bit) floating-point elements in
a and
b, and add the negated intermediate result to the lower element in
c. Store the result in the lower element of
dst using writemask
k (the element is copied from
a when mask bit 0 is not set), and copy the upper 3 packed elements from
a to the upper elements of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0]
ELSE
dst[31:0] := a[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vfnmadd132ss, vfnmadd213ss, vfnmadd231ss
__m128 _mm_mask3_fnmadd_round_ss (__m128 a, __m128 b, __m128 c, __mmask8 k, int rounding)
Synopsis
__m128 _mm_mask3_fnmadd_round_ss (__m128 a, __m128 b, __m128 c, __mmask8 k, int rounding)
#include "immintrin.h"
Instruction: vfnmadd132ss xmm {k}, xmm, xmm {er}
vfnmadd213ss xmm {k}, xmm, xmm {er}
vfnmadd231ss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Multiply the lower single-precision (32-bit) floating-point elements in
a and
b, and add the negated intermediate result to the lower element in
c. Store the result in the lower element of
dst using writemask
k (the element is copied from
c when mask bit 0 is not set), and copy the upper 3 packed elements from
a to the upper elements of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0]
ELSE
dst[31:0] := c[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vfnmadd132ss, vfnmadd213ss, vfnmadd231ss
__m128 _mm_maskz_fnmadd_round_ss (__mmask8 k, __m128 a, __m128 b, __m128 c, int rounding)
Synopsis
__m128 _mm_maskz_fnmadd_round_ss (__mmask8 k, __m128 a, __m128 b, __m128 c, int rounding)
#include "immintrin.h"
Instruction: vfnmadd132ss xmm {k}, xmm, xmm {er}
vfnmadd213ss xmm {k}, xmm, xmm {er}
vfnmadd231ss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Multiply the lower single-precision (32-bit) floating-point elements in
a and
b, and add the negated intermediate result to the lower element in
c. Store the result in the lower element of
dst using zeromask
k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from
a to the upper elements of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0]
ELSE
dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vfnmadd132sd, vfnmadd213sd, vfnmadd231sd
__m128d _mm_fnmadd_sd (__m128d a, __m128d b, __m128d c)
Synopsis
__m128d _mm_fnmadd_sd (__m128d a, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfnmadd132sd xmm, xmm, xmm
vfnmadd213sd xmm, xmm, xmm
vfnmadd231sd xmm, xmm, xmm
CPUID Flags: FMA
Description
Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
Operation
dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0]
dst[127:64] := a[127:64]
dst[MAX:128] := 0
Performance
vfnmadd132sd, vfnmadd213sd, vfnmadd231sd
__m128d _mm_mask_fnmadd_sd (__m128d a, __mmask8 k, __m128d b, __m128d c)
Synopsis
__m128d _mm_mask_fnmadd_sd (__m128d a, __mmask8 k, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfnmadd132sd xmm {k}, xmm, xmm
vfnmadd213sd xmm {k}, xmm, xmm
vfnmadd231sd xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Operation
IF k[0]
dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0]
ELSE
dst[63:0] := a[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vfnmadd132sd, vfnmadd213sd, vfnmadd231sd
__m128d _mm_mask3_fnmadd_sd (__m128d a, __m128d b, __m128d c, __mmask8 k)
Synopsis
__m128d _mm_mask3_fnmadd_sd (__m128d a, __m128d b, __m128d c, __mmask8 k)
#include "immintrin.h"
Instruction: vfnmadd132sd xmm {k}, xmm, xmm
vfnmadd213sd xmm {k}, xmm, xmm
vfnmadd231sd xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Operation
IF k[0]
dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0]
ELSE
dst[63:0] := c[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vfnmadd132sd, vfnmadd213sd, vfnmadd231sd
__m128d _mm_maskz_fnmadd_sd (__mmask8 k, __m128d a, __m128d b, __m128d c)
Synopsis
__m128d _mm_maskz_fnmadd_sd (__mmask8 k, __m128d a, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfnmadd132sd xmm {k}, xmm, xmm
vfnmadd213sd xmm {k}, xmm, xmm
vfnmadd231sd xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Operation
IF k[0]
dst[63:0] := -(a[63:0] * b[63:0]) + c[63:0]
ELSE
dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vfnmadd132ss, vfnmadd213ss, vfnmadd231ss
__m128 _mm_fnmadd_ss (__m128 a, __m128 b, __m128 c)
Synopsis
__m128 _mm_fnmadd_ss (__m128 a, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfnmadd132ss xmm, xmm, xmm
vfnmadd213ss xmm, xmm, xmm
vfnmadd231ss xmm, xmm, xmm
CPUID Flags: FMA
Description
Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0]
dst[127:32] := a[127:32]
dst[MAX:128] := 0
Performance
vfnmadd132ss, vfnmadd213ss, vfnmadd231ss
__m128 _mm_mask_fnmadd_ss (__m128 a, __mmask8 k, __m128 b, __m128 c)
Synopsis
__m128 _mm_mask_fnmadd_ss (__m128 a, __mmask8 k, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfnmadd132ss xmm {k}, xmm, xmm
vfnmadd213ss xmm {k}, xmm, xmm
vfnmadd231ss xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
IF k[0]
dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0]
ELSE
dst[31:0] := a[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vfnmadd132ss, vfnmadd213ss, vfnmadd231ss
__m128 _mm_mask3_fnmadd_ss (__m128 a, __m128 b, __m128 c, __mmask8 k)
Synopsis
__m128 _mm_mask3_fnmadd_ss (__m128 a, __m128 b, __m128 c, __mmask8 k)
#include "immintrin.h"
Instruction: vfnmadd132ss xmm {k}, xmm, xmm
vfnmadd213ss xmm {k}, xmm, xmm
vfnmadd231ss xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
IF k[0]
dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0]
ELSE
dst[31:0] := c[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vfnmadd132ss, vfnmadd213ss, vfnmadd231ss
__m128 _mm_maskz_fnmadd_ss (__mmask8 k, __m128 a, __m128 b, __m128 c)
Synopsis
__m128 _mm_maskz_fnmadd_ss (__mmask8 k, __m128 a, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfnmadd132ss xmm {k}, xmm, xmm
vfnmadd213ss xmm {k}, xmm, xmm
vfnmadd231ss xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
IF k[0]
dst[31:0] := -(a[31:0] * b[31:0]) + c[31:0]
ELSE
dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vfnmsub132pd, vfnmsub213pd, vfnmsub231pd
__m128d _mm_fnmsub_pd (__m128d a, __m128d b, __m128d c)
Synopsis
__m128d _mm_fnmsub_pd (__m128d a, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfnmsub132pd xmm, xmm, xmm
vfnmsub213pd xmm, xmm, xmm
vfnmsub231pd xmm, xmm, xmm
CPUID Flags: FMA
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
ENDFOR
dst[MAX:128] := 0
Performance
vfnmsub132pd, vfnmsub213pd, vfnmsub231pd
__m128d _mm_mask_fnmsub_pd (__m128d a, __mmask8 k, __m128d b, __m128d c)
Synopsis
__m128d _mm_mask_fnmsub_pd (__m128d a, __mmask8 k, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfnmsub132pd
vfnmsub213pd
vfnmsub231pd
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vfnmsub132pd, vfnmsub213pd, vfnmsub231pd
__m128d _mm_mask3_fnmsub_pd (__m128d a, __m128d b, __m128d c, __mmask8 k)
Synopsis
__m128d _mm_mask3_fnmsub_pd (__m128d a, __m128d b, __m128d c, __mmask8 k)
#include "immintrin.h"
Instruction: vfnmsub132pd
vfnmsub213pd
vfnmsub231pd
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
ELSE
dst[i+63:i] := c[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vfnmsub132pd, vfnmsub213pd, vfnmsub231pd
__m128d _mm_maskz_fnmsub_pd (__mmask8 k, __m128d a, __m128d b, __m128d c)
Synopsis
__m128d _mm_maskz_fnmsub_pd (__mmask8 k, __m128d a, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfnmsub132pd
vfnmsub213pd
vfnmsub231pd
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vfnmsub132pd, vfnmsub213pd, vfnmsub231pd
__m256d _mm256_fnmsub_pd (__m256d a, __m256d b, __m256d c)
Synopsis
__m256d _mm256_fnmsub_pd (__m256d a, __m256d b, __m256d c)
#include "immintrin.h"
Instruction: vfnmsub132pd ymm, ymm, ymm
vfnmsub213pd ymm, ymm, ymm
vfnmsub231pd ymm, ymm, ymm
CPUID Flags: FMA
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
ENDFOR
dst[MAX:256] := 0
Performance
vfnmsub132pd, vfnmsub213pd, vfnmsub231pd
__m256d _mm256_mask_fnmsub_pd (__m256d a, __mmask8 k, __m256d b, __m256d c)
Synopsis
__m256d _mm256_mask_fnmsub_pd (__m256d a, __mmask8 k, __m256d b, __m256d c)
#include "immintrin.h"
Instruction: vfnmsub132pd
vfnmsub213pd
vfnmsub231pd
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vfnmsub132pd, vfnmsub213pd, vfnmsub231pd
__m256d _mm256_mask3_fnmsub_pd (__m256d a, __m256d b, __m256d c, __mmask8 k)
Synopsis
__m256d _mm256_mask3_fnmsub_pd (__m256d a, __m256d b, __m256d c, __mmask8 k)
#include "immintrin.h"
Instruction: vfnmsub132pd
vfnmsub213pd
vfnmsub231pd
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
ELSE
dst[i+63:i] := c[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vfnmsub132pd, vfnmsub213pd, vfnmsub231pd
__m256d _mm256_maskz_fnmsub_pd (__mmask8 k, __m256d a, __m256d b, __m256d c)
Synopsis
__m256d _mm256_maskz_fnmsub_pd (__mmask8 k, __m256d a, __m256d b, __m256d c)
#include "immintrin.h"
Instruction: vfnmsub132pd
vfnmsub213pd
vfnmsub231pd
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vfnmsub132pd, vfnmsub213pd, vfnmsub231pd
__m512d _mm512_fnmsub_pd (__m512d a, __m512d b, __m512d c)
Synopsis
__m512d _mm512_fnmsub_pd (__m512d a, __m512d b, __m512d c)
#include "immintrin.h"
Instruction: vfnmsub132pd zmm {k}, zmm, zmm
vfnmsub213pd zmm {k}, zmm, zmm
vfnmsub231pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
ENDFOR
dst[MAX:512] := 0
vfnmsub132pd, vfnmsub213pd, vfnmsub231pd
__m512d _mm512_mask_fnmsub_pd (__m512d a, __mmask8 k, __m512d b, __m512d c)
Synopsis
__m512d _mm512_mask_fnmsub_pd (__m512d a, __mmask8 k, __m512d b, __m512d c)
#include "immintrin.h"
Instruction: vfnmsub132pd zmm {k}, zmm, zmm
vfnmsub213pd zmm {k}, zmm, zmm
vfnmsub231pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vfnmsub132pd, vfnmsub213pd, vfnmsub231pd
__m512d _mm512_mask3_fnmsub_pd (__m512d a, __m512d b, __m512d c, __mmask8 k)
Synopsis
__m512d _mm512_mask3_fnmsub_pd (__m512d a, __m512d b, __m512d c, __mmask8 k)
#include "immintrin.h"
Instruction: vfnmsub132pd zmm {k}, zmm, zmm
vfnmsub213pd zmm {k}, zmm, zmm
vfnmsub231pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
ELSE
dst[i+63:i] := c[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vfnmsub132pd, vfnmsub213pd, vfnmsub231pd
__m512d _mm512_maskz_fnmsub_pd (__mmask8 k, __m512d a, __m512d b, __m512d c)
Synopsis
__m512d _mm512_maskz_fnmsub_pd (__mmask8 k, __m512d a, __m512d b, __m512d c)
#include "immintrin.h"
Instruction: vfnmsub132pd zmm {k}, zmm, zmm
vfnmsub213pd zmm {k}, zmm, zmm
vfnmsub231pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vfnmsub132ps, vfnmsub213ps, vfnmsub231ps
__m128 _mm_fnmsub_ps (__m128 a, __m128 b, __m128 c)
Synopsis
__m128 _mm_fnmsub_ps (__m128 a, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfnmsub132ps xmm, xmm, xmm
vfnmsub213ps xmm, xmm, xmm
vfnmsub231ps xmm, xmm, xmm
CPUID Flags: FMA
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
ENDFOR
dst[MAX:128] := 0
Performance
vfnmsub132ps, vfnmsub213ps, vfnmsub231ps
__m128 _mm_mask_fnmsub_ps (__m128 a, __mmask8 k, __m128 b, __m128 c)
Synopsis
__m128 _mm_mask_fnmsub_ps (__m128 a, __mmask8 k, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfnmsub132ps
vfnmsub213ps
vfnmsub231ps
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
ELSE
dst[i+31:i] := a[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vfnmsub132ps, vfnmsub213ps, vfnmsub231ps
__m128 _mm_mask3_fnmsub_ps (__m128 a, __m128 b, __m128 c, __mmask8 k)
Synopsis
__m128 _mm_mask3_fnmsub_ps (__m128 a, __m128 b, __m128 c, __mmask8 k)
#include "immintrin.h"
Instruction: vfnmsub132ps
vfnmsub213ps
vfnmsub231ps
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
ELSE
dst[i+31:i] := c[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vfnmsub132ps, vfnmsub213ps, vfnmsub231ps
__m128 _mm_maskz_fnmsub_ps (__mmask8 k, __m128 a, __m128 b, __m128 c)
Synopsis
__m128 _mm_maskz_fnmsub_ps (__mmask8 k, __m128 a, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfnmsub132ps
vfnmsub213ps
vfnmsub231ps
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vfnmsub132ps, vfnmsub213ps, vfnmsub231ps
__m256 _mm256_fnmsub_ps (__m256 a, __m256 b, __m256 c)
Synopsis
__m256 _mm256_fnmsub_ps (__m256 a, __m256 b, __m256 c)
#include "immintrin.h"
Instruction: vfnmsub132ps ymm, ymm, ymm
vfnmsub213ps ymm, ymm, ymm
vfnmsub231ps ymm, ymm, ymm
CPUID Flags: FMA
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
ENDFOR
dst[MAX:256] := 0
Performance
vfnmsub132ps, vfnmsub213ps, vfnmsub231ps
__m256 _mm256_mask_fnmsub_ps (__m256 a, __mmask8 k, __m256 b, __m256 c)
Synopsis
__m256 _mm256_mask_fnmsub_ps (__m256 a, __mmask8 k, __m256 b, __m256 c)
#include "immintrin.h"
Instruction: vfnmsub132ps
vfnmsub213ps
vfnmsub231ps
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
ELSE
dst[i+31:i] := a[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vfnmsub132ps, vfnmsub213ps, vfnmsub231ps
__m256 _mm256_mask3_fnmsub_ps (__m256 a, __m256 b, __m256 c, __mmask8 k)
Synopsis
__m256 _mm256_mask3_fnmsub_ps (__m256 a, __m256 b, __m256 c, __mmask8 k)
#include "immintrin.h"
Instruction: vfnmsub132ps
vfnmsub213ps
vfnmsub231ps
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
ELSE
dst[i+31:i] := c[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vfnmsub132ps, vfnmsub213ps, vfnmsub231ps
__m256 _mm256_maskz_fnmsub_ps (__mmask8 k, __m256 a, __m256 b, __m256 c)
Synopsis
__m256 _mm256_maskz_fnmsub_ps (__mmask8 k, __m256 a, __m256 b, __m256 c)
#include "immintrin.h"
Instruction: vfnmsub132ps
vfnmsub213ps
vfnmsub231ps
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vfnmsub132ps, vfnmsub213ps, vfnmsub231ps
__m512 _mm512_fnmsub_ps (__m512 a, __m512 b, __m512 c)
Synopsis
__m512 _mm512_fnmsub_ps (__m512 a, __m512 b, __m512 c)
#include "immintrin.h"
Instruction: vfnmsub132ps zmm {k}, zmm, zmm
vfnmsub213ps zmm {k}, zmm, zmm
vfnmsub231ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
ENDFOR
dst[MAX:512] := 0
vfnmsub132ps, vfnmsub213ps, vfnmsub231ps
__m512 _mm512_mask_fnmsub_ps (__m512 a, __mmask16 k, __m512 b, __m512 c)
Synopsis
__m512 _mm512_mask_fnmsub_ps (__m512 a, __mmask16 k, __m512 b, __m512 c)
#include "immintrin.h"
Instruction: vfnmsub132ps zmm {k}, zmm, zmm
vfnmsub213ps zmm {k}, zmm, zmm
vfnmsub231ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
ELSE
dst[i+31:i] := a[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vfnmsub132ps, vfnmsub213ps, vfnmsub231ps
__m512 _mm512_mask3_fnmsub_ps (__m512 a, __m512 b, __m512 c, __mmask16 k)
Synopsis
__m512 _mm512_mask3_fnmsub_ps (__m512 a, __m512 b, __m512 c, __mmask16 k)
#include "immintrin.h"
Instruction: vfnmsub132ps zmm {k}, zmm, zmm
vfnmsub213ps zmm {k}, zmm, zmm
vfnmsub231ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
ELSE
dst[i+31:i] := c[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vfnmsub132ps, vfnmsub213ps, vfnmsub231ps
__m512 _mm512_maskz_fnmsub_ps (__mmask16 k, __m512 a, __m512 b, __m512 c)
Synopsis
__m512 _mm512_maskz_fnmsub_ps (__mmask16 k, __m512 a, __m512 b, __m512 c)
#include "immintrin.h"
Instruction: vfnmsub132ps zmm {k}, zmm, zmm
vfnmsub213ps zmm {k}, zmm, zmm
vfnmsub231ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vfnmsub132pd, vfnmsub213pd, vfnmsub231pd
__m512d _mm512_fnmsub_round_pd (__m512d a, __m512d b, __m512d c, int rounding)
Synopsis
__m512d _mm512_fnmsub_round_pd (__m512d a, __m512d b, __m512d c, int rounding)
#include "immintrin.h"
Instruction: vfnmsub132pd zmm {k}, zmm, zmm {er}
vfnmsub213pd zmm {k}, zmm, zmm {er}
vfnmsub231pd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed double-precision (64-bit) floating-point elements in
a and
b, subtract packed elements in
c from the negated intermediate result, and store the results in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
ENDFOR
dst[MAX:512] := 0
vfnmsub132pd, vfnmsub213pd, vfnmsub231pd
__m512d _mm512_mask_fnmsub_round_pd (__m512d a, __mmask8 k, __m512d b, __m512d c, int rounding)
Synopsis
__m512d _mm512_mask_fnmsub_round_pd (__m512d a, __mmask8 k, __m512d b, __m512d c, int rounding)
#include "immintrin.h"
Instruction: vfnmsub132pd zmm {k}, zmm, zmm {er}
vfnmsub213pd zmm {k}, zmm, zmm {er}
vfnmsub231pd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed double-precision (64-bit) floating-point elements in
a and
b, subtract packed elements in
c from the negated intermediate result, and store the results in
dst using writemask
k (elements are copied from
a when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vfnmsub132pd, vfnmsub213pd, vfnmsub231pd
__m512d _mm512_mask3_fnmsub_round_pd (__m512d a, __m512d b, __m512d c, __mmask8 k, int rounding)
Synopsis
__m512d _mm512_mask3_fnmsub_round_pd (__m512d a, __m512d b, __m512d c, __mmask8 k, int rounding)
#include "immintrin.h"
Instruction: vfnmsub132pd zmm {k}, zmm, zmm {er}
vfnmsub213pd zmm {k}, zmm, zmm {er}
vfnmsub231pd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed double-precision (64-bit) floating-point elements in
a and
b, subtract packed elements in
c from the negated intermediate result, and store the results in
dst using writemask
k (elements are copied from
c when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
ELSE
dst[i+63:i] := c[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vfnmsub132pd, vfnmsub213pd, vfnmsub231pd
__m512d _mm512_maskz_fnmsub_round_pd (__mmask8 k, __m512d a, __m512d b, __m512d c, const int rounding)
Synopsis
__m512d _mm512_maskz_fnmsub_round_pd (__mmask8 k, __m512d a, __m512d b, __m512d c, const int rounding)
#include "immintrin.h"
Instruction: vfnmsub132pd zmm {k}, zmm, zmm {er}
vfnmsub213pd zmm {k}, zmm, zmm {er}
vfnmsub231pd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in
a and
b, subtract packed elements in
c from the negated intermediate result, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := -(a[i+63:i] * b[i+63:i]) - c[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vfnmsub132ps, vfnmsub213ps, vfnmsub231ps
__m512 _mm512_fnmsub_round_ps (__m512 a, __m512 b, __m512 c, int rounding)
Synopsis
__m512 _mm512_fnmsub_round_ps (__m512 a, __m512 b, __m512 c, int rounding)
#include "immintrin.h"
Instruction: vfnmsub132ps zmm {k}, zmm, zmm {er}
vfnmsub213ps zmm {k}, zmm, zmm {er}
vfnmsub231ps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed single-precision (32-bit) floating-point elements in
a and
b, subtract packed elements in
c from the negated intermediate result, and store the results in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
ENDFOR
dst[MAX:512] := 0
vfnmsub132ps, vfnmsub213ps, vfnmsub231ps
__m512 _mm512_mask_fnmsub_round_ps (__m512 a, __mmask16 k, __m512 b, __m512 c, int rounding)
Synopsis
__m512 _mm512_mask_fnmsub_round_ps (__m512 a, __mmask16 k, __m512 b, __m512 c, int rounding)
#include "immintrin.h"
Instruction: vfnmsub132ps zmm {k}, zmm, zmm {er}
vfnmsub213ps zmm {k}, zmm, zmm {er}
vfnmsub231ps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed single-precision (32-bit) floating-point elements in
a and
b, subtract packed elements in
c from the negated intermediate result, and store the results in
dst using writemask
k (elements are copied from
a when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
ELSE
dst[i+31:i] := a[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vfnmsub132ps, vfnmsub213ps, vfnmsub231ps
__m512 _mm512_mask3_fnmsub_round_ps (__m512 a, __m512 b, __m512 c, __mmask16 k, int rounding)
Synopsis
__m512 _mm512_mask3_fnmsub_round_ps (__m512 a, __m512 b, __m512 c, __mmask16 k, int rounding)
#include "immintrin.h"
Instruction: vfnmsub132ps zmm {k}, zmm, zmm {er}
vfnmsub213ps zmm {k}, zmm, zmm {er}
vfnmsub231ps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed single-precision (32-bit) floating-point elements in
a and
b, subtract packed elements in
c from the negated intermediate result, and store the results in
dst using writemask
k (elements are copied from
c when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
ELSE
dst[i+31:i] := c[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vfnmsub132ps, vfnmsub213ps, vfnmsub231ps
__m512 _mm512_maskz_fnmsub_round_ps (__mmask16 k, __m512 a, __m512 b, __m512 c, const int rounding)
Synopsis
__m512 _mm512_maskz_fnmsub_round_ps (__mmask16 k, __m512 a, __m512 b, __m512 c, const int rounding)
#include "immintrin.h"
Instruction: vfnmsub132ps zmm {k}, zmm, zmm {er}
vfnmsub213ps zmm {k}, zmm, zmm {er}
vfnmsub231ps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in
a and
b, subtract packed elements in
c from the negated intermediate result, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := -(a[i+31:i] * b[i+31:i]) - c[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vfnmsub132sd, vfnmsub213sd, vfnmsub231sd
__m128d _mm_mask_fnmsub_round_sd (__m128d a, __mmask8 k, __m128d b, __m128d c, int rounding)
Synopsis
__m128d _mm_mask_fnmsub_round_sd (__m128d a, __mmask8 k, __m128d b, __m128d c, int rounding)
#include "immintrin.h"
Instruction: vfnmsub132sd xmm {k}, xmm, xmm {er}
vfnmsub213sd xmm {k}, xmm, xmm {er}
vfnmsub231sd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Multiply the lower double-precision (64-bit) floating-point elements in
a and
b, and subtract the lower element in
c from the negated intermediate result. Store the result in the lower element of
dst using writemask
k (the element is copied from
c when mask bit 0 is not set), and copy the upper element from
a to the upper element of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0]
ELSE
dst[63:0] := a[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vfnmsub132sd, vfnmsub213sd, vfnmsub231sd
__m128d _mm_mask3_fnmsub_round_sd (__m128d a, __m128d b, __m128d c, __mmask8 k, int rounding)
Synopsis
__m128d _mm_mask3_fnmsub_round_sd (__m128d a, __m128d b, __m128d c, __mmask8 k, int rounding)
#include "immintrin.h"
Instruction: vfnmsub132sd xmm {k}, xmm, xmm {er}
vfnmsub213sd xmm {k}, xmm, xmm {er}
vfnmsub231sd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Multiply the lower double-precision (64-bit) floating-point elements in
a and
b, and subtract the lower element in
c from the negated intermediate result. Store the result in the lower element of
dst using writemask
k (the element is copied from
c when mask bit 0 is not set), and copy the upper element from
a to the upper element of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0]
ELSE
dst[63:0] := c[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vfnmsub132sd, vfnmsub213sd, vfnmsub231sd
__m128d _mm_maskz_fnmsub_round_sd (__mmask8 k, __m128d a, __m128d b, __m128d c, int rounding)
Synopsis
__m128d _mm_maskz_fnmsub_round_sd (__mmask8 k, __m128d a, __m128d b, __m128d c, int rounding)
#include "immintrin.h"
Instruction: vfnmsub132sd xmm {k}, xmm, xmm {er}
vfnmsub213sd xmm {k}, xmm, xmm {er}
vfnmsub231sd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Multiply the lower double-precision (64-bit) floating-point elements in
a and
b, and subtract the lower element in
c from the negated intermediate result. Store the result in the lower element of
dst using zeromask
k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from
a to the upper element of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0]
ELSE
dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vfnmsub132ss, vfnmsub213ss, vfnmsub231ss
__m128 _mm_mask_fnmsub_round_ss (__m128 a, __mmask8 k, __m128 b, __m128 c, int rounding)
Synopsis
__m128 _mm_mask_fnmsub_round_ss (__m128 a, __mmask8 k, __m128 b, __m128 c, int rounding)
#include "immintrin.h"
Instruction: vfnmsub132ss xmm {k}, xmm, xmm {er}
vfnmsub213ss xmm {k}, xmm, xmm {er}
vfnmsub231ss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Multiply the lower single-precision (32-bit) floating-point elements in
a and
b, and subtract the lower element in
c from the negated intermediate result. Store the result in the lower element of
dst using writemask
k (the element is copied from
c when mask bit 0 is not set), and copy the upper 3 packed elements from
a to the upper elements of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0]
ELSE
dst[31:0] := a[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vfnmsub132ss, vfnmsub213ss, vfnmsub231ss
__m128 _mm_mask3_fnmsub_round_ss (__m128 a, __m128 b, __m128 c, __mmask8 k, int rounding)
Synopsis
__m128 _mm_mask3_fnmsub_round_ss (__m128 a, __m128 b, __m128 c, __mmask8 k, int rounding)
#include "immintrin.h"
Instruction: vfnmsub132ss xmm {k}, xmm, xmm {er}
vfnmsub213ss xmm {k}, xmm, xmm {er}
vfnmsub231ss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Multiply the lower single-precision (32-bit) floating-point elements in
a and
b, and subtract the lower element in
c from the negated intermediate result. Store the result in the lower element of
dst using writemask
k (the element is copied from
c when mask bit 0 is not set), and copy the upper 3 packed elements from
a to the upper elements of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0]
ELSE
dst[31:0] := c[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vfnmsub132ss, vfnmsub213ss, vfnmsub231ss
__m128 _mm_maskz_fnmsub_round_ss (__mmask8 k, __m128 a, __m128 b, __m128 c, int rounding)
Synopsis
__m128 _mm_maskz_fnmsub_round_ss (__mmask8 k, __m128 a, __m128 b, __m128 c, int rounding)
#include "immintrin.h"
Instruction: vfnmsub132ss xmm {k}, xmm, xmm {er}
vfnmsub213ss xmm {k}, xmm, xmm {er}
vfnmsub231ss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Multiply the lower single-precision (32-bit) floating-point elements in
a and
b, and subtract the lower element in
c from the negated intermediate result. Store the result in the lower element of
dst using zeromask
k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from
a to the upper elements of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0]
ELSE
dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vfnmsub132sd, vfnmsub213sd, vfnmsub231sd
__m128d _mm_fnmsub_sd (__m128d a, __m128d b, __m128d c)
Synopsis
__m128d _mm_fnmsub_sd (__m128d a, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfnmsub132sd xmm, xmm, xmm
vfnmsub213sd xmm, xmm, xmm
vfnmsub231sd xmm, xmm, xmm
CPUID Flags: FMA
Description
Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
Operation
dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0]
dst[127:64] := a[127:64]
dst[MAX:128] := 0
Performance
vfnmsub132sd, vfnmsub213sd, vfnmsub231sd
__m128d _mm_mask_fnmsub_sd (__m128d a, __mmask8 k, __m128d b, __m128d c)
Synopsis
__m128d _mm_mask_fnmsub_sd (__m128d a, __mmask8 k, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfnmsub132sd xmm {k}, xmm, xmm
vfnmsub213sd xmm {k}, xmm, xmm
vfnmsub231sd xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Operation
IF k[0]
dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0]
ELSE
dst[63:0] := a[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vfnmsub132sd, vfnmsub213sd, vfnmsub231sd
__m128d _mm_mask3_fnmsub_sd (__m128d a, __m128d b, __m128d c, __mmask8 k)
Synopsis
__m128d _mm_mask3_fnmsub_sd (__m128d a, __m128d b, __m128d c, __mmask8 k)
#include "immintrin.h"
Instruction: vfnmsub132sd xmm {k}, xmm, xmm
vfnmsub213sd xmm {k}, xmm, xmm
vfnmsub231sd xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Operation
IF k[0]
dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0]
ELSE
dst[63:0] := c[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vfnmsub132sd, vfnmsub213sd, vfnmsub231sd
__m128d _mm_maskz_fnmsub_sd (__mmask8 k, __m128d a, __m128d b, __m128d c)
Synopsis
__m128d _mm_maskz_fnmsub_sd (__mmask8 k, __m128d a, __m128d b, __m128d c)
#include "immintrin.h"
Instruction: vfnmsub132sd xmm {k}, xmm, xmm
vfnmsub213sd xmm {k}, xmm, xmm
vfnmsub231sd xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Operation
IF k[0]
dst[63:0] := -(a[63:0] * b[63:0]) - c[63:0]
ELSE
dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vfnmsub132ss, vfnmsub213ss, vfnmsub231ss
__m128 _mm_fnmsub_ss (__m128 a, __m128 b, __m128 c)
Synopsis
__m128 _mm_fnmsub_ss (__m128 a, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfnmsub132ss xmm, xmm, xmm
vfnmsub213ss xmm, xmm, xmm
vfnmsub231ss xmm, xmm, xmm
CPUID Flags: FMA
Description
Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0]
dst[127:32] := a[127:32]
dst[MAX:128] := 0
Performance
vfnmsub132ss, vfnmsub213ss, vfnmsub231ss
__m128 _mm_mask_fnmsub_ss (__m128 a, __mmask8 k, __m128 b, __m128 c)
Synopsis
__m128 _mm_mask_fnmsub_ss (__m128 a, __mmask8 k, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfnmsub132ss xmm {k}, xmm, xmm
vfnmsub213ss xmm {k}, xmm, xmm
vfnmsub231ss xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
IF k[0]
dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0]
ELSE
dst[31:0] := a[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vfnmsub132ss, vfnmsub213ss, vfnmsub231ss
__m128 _mm_mask3_fnmsub_ss (__m128 a, __m128 b, __m128 c, __mmask8 k)
Synopsis
__m128 _mm_mask3_fnmsub_ss (__m128 a, __m128 b, __m128 c, __mmask8 k)
#include "immintrin.h"
Instruction: vfnmsub132ss xmm {k}, xmm, xmm
vfnmsub213ss xmm {k}, xmm, xmm
vfnmsub231ss xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
IF k[0]
dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0]
ELSE
dst[31:0] := c[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vfnmsub132ss, vfnmsub213ss, vfnmsub231ss
__m128 _mm_maskz_fnmsub_ss (__mmask8 k, __m128 a, __m128 b, __m128 c)
Synopsis
__m128 _mm_maskz_fnmsub_ss (__mmask8 k, __m128 a, __m128 b, __m128 c)
#include "immintrin.h"
Instruction: vfnmsub132ss xmm {k}, xmm, xmm
vfnmsub213ss xmm {k}, xmm, xmm
vfnmsub231ss xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
IF k[0]
dst[31:0] := -(a[31:0] * b[31:0]) - c[31:0]
ELSE
dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vfpclasspd
__mmask8 _mm_fpclass_pd_mask (__m128d a, int imm8)
Synopsis
__mmask8 _mm_fpclass_pd_mask (__m128d a, int imm8)
#include "immintrin.h"
Instruction: vfpclasspd
CPUID Flags: AVX512VL + AVX512DQ
Description
Test packed double-precision (64-bit) floating-point elements in
a for special categories specified by
imm8, and store the results in mask vector
k.
imm can be a combination of:
0x01 // QNaN
0x02 // Positive Zero
0x04 // Negative Zero
0x08 // Positive Infinity
0x10 // Negative Infinity
0x20 // Denormal
0x40 // Negative
0x80 // SNaN
Operation
FOR j := 0 to 1
i := j*64
k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0])
ENDFOR
k[MAX:2] := 0
vfpclasspd
__mmask8 _mm_mask_fpclass_pd_mask (__mmask8 k1, __m128d a, int imm8)
Synopsis
__mmask8 _mm_mask_fpclass_pd_mask (__mmask8 k1, __m128d a, int imm8)
#include "immintrin.h"
Instruction: vfpclasspd
CPUID Flags: AVX512VL + AVX512DQ
Description
Test packed double-precision (64-bit) floating-point elements in
a for special categories specified by
imm8, and store the results in mask vector
k using zeromask
k1 (elements are zeroed out when the corresponding mask bit is not set).
imm can be a combination of:
0x01 // QNaN
0x02 // Positive Zero
0x04 // Negative Zero
0x08 // Positive Infinity
0x10 // Negative Infinity
0x20 // Denormal
0x40 // Negative
0x80 // SNaN
Operation
FOR j := 0 to 1
i := j*64
IF k1[j]
k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0])
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:2] := 0
vfpclasspd
__mmask8 _mm256_fpclass_pd_mask (__m256d a, int imm8)
Synopsis
__mmask8 _mm256_fpclass_pd_mask (__m256d a, int imm8)
#include "immintrin.h"
Instruction: vfpclasspd
CPUID Flags: AVX512VL + AVX512DQ
Description
Test packed double-precision (64-bit) floating-point elements in
a for special categories specified by
imm8, and store the results in mask vector
k.
imm can be a combination of:
0x01 // QNaN
0x02 // Positive Zero
0x04 // Negative Zero
0x08 // Positive Infinity
0x10 // Negative Infinity
0x20 // Denormal
0x40 // Negative
0x80 // SNaN
Operation
FOR j := 0 to 3
i := j*64
k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0])
ENDFOR
k[MAX:4] := 0
vfpclasspd
__mmask8 _mm256_mask_fpclass_pd_mask (__mmask8 k1, __m256d a, int imm8)
Synopsis
__mmask8 _mm256_mask_fpclass_pd_mask (__mmask8 k1, __m256d a, int imm8)
#include "immintrin.h"
Instruction: vfpclasspd
CPUID Flags: AVX512VL + AVX512DQ
Description
Test packed double-precision (64-bit) floating-point elements in
a for special categories specified by
imm8, and store the results in mask vector
k using zeromask
k1 (elements are zeroed out when the corresponding mask bit is not set).
imm can be a combination of:
0x01 // QNaN
0x02 // Positive Zero
0x04 // Negative Zero
0x08 // Positive Infinity
0x10 // Negative Infinity
0x20 // Denormal
0x40 // Negative
0x80 // SNaN
Operation
FOR j := 0 to 3
i := j*64
IF k1[j]
k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0])
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:4] := 0
vfpclasspd
__mmask8 _mm512_fpclass_pd_mask (__m512d a, int imm8)
Synopsis
__mmask8 _mm512_fpclass_pd_mask (__m512d a, int imm8)
#include "immintrin.h"
Instruction: vfpclasspd
CPUID Flags: AVX512DQ
Description
Test packed double-precision (64-bit) floating-point elements in
a for special categories specified by
imm8, and store the results in mask vector
k.
imm can be a combination of:
0x01 // QNaN
0x02 // Positive Zero
0x04 // Negative Zero
0x08 // Positive Infinity
0x10 // Negative Infinity
0x20 // Denormal
0x40 // Negative
0x80 // SNaN
Operation
FOR j := 0 to 7
i := j*64
k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0])
ENDFOR
k[MAX:8] := 0
vfpclasspd
__mmask8 _mm512_mask_fpclass_pd_mask (__mmask8 k1, __m512d a, int imm8)
Synopsis
__mmask8 _mm512_mask_fpclass_pd_mask (__mmask8 k1, __m512d a, int imm8)
#include "immintrin.h"
Instruction: vfpclasspd
CPUID Flags: AVX512DQ
Description
Test packed double-precision (64-bit) floating-point elements in
a for special categories specified by
imm8, and store the results in mask vector
k using zeromask
k1 (elements are zeroed out when the corresponding mask bit is not set).
imm can be a combination of:
0x01 // QNaN
0x02 // Positive Zero
0x04 // Negative Zero
0x08 // Positive Infinity
0x10 // Negative Infinity
0x20 // Denormal
0x40 // Negative
0x80 // SNaN
Operation
FOR j := 0 to 7
i := j*64
IF k1[j]
k[j] := CheckFPClass_FP64(a[i+63:i], imm8[7:0])
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vfpclassps
__mmask8 _mm_fpclass_ps_mask (__m128 a, int imm8)
Synopsis
__mmask8 _mm_fpclass_ps_mask (__m128 a, int imm8)
#include "immintrin.h"
Instruction: vfpclassps
CPUID Flags: AVX512VL + AVX512DQ
Description
Test packed single-precision (32-bit) floating-point elements in
a for special categories specified by
imm8, and store the results in mask vector
k.
imm can be a combination of:
0x01 // QNaN
0x02 // Positive Zero
0x04 // Negative Zero
0x08 // Positive Infinity
0x10 // Negative Infinity
0x20 // Denormal
0x40 // Negative
0x80 // SNaN
Operation
FOR j := 0 to 3
i := j*32
k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0])
ENDFOR
k[MAX:4] := 0
vfpclassps
__mmask8 _mm_mask_fpclass_ps_mask (__mmask8 k1, __m128 a, int imm8)
Synopsis
__mmask8 _mm_mask_fpclass_ps_mask (__mmask8 k1, __m128 a, int imm8)
#include "immintrin.h"
Instruction: vfpclassps
CPUID Flags: AVX512VL + AVX512DQ
Description
Test packed single-precision (32-bit) floating-point elements in
a for special categories specified by
imm8, and store the results in mask vector
k using zeromask
k1 (elements are zeroed out when the corresponding mask bit is not set).
imm can be a combination of:
0x01 // QNaN
0x02 // Positive Zero
0x04 // Negative Zero
0x08 // Positive Infinity
0x10 // Negative Infinity
0x20 // Denormal
0x40 // Negative
0x80 // SNaN
Operation
FOR j := 0 to 3
i := j*32
IF k1[j]
k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0])
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:4] := 0
vfpclassps
__mmask8 _mm256_fpclass_ps_mask (__m256 a, int imm8)
Synopsis
__mmask8 _mm256_fpclass_ps_mask (__m256 a, int imm8)
#include "immintrin.h"
Instruction: vfpclassps
CPUID Flags: AVX512VL + AVX512DQ
Description
Test packed single-precision (32-bit) floating-point elements in
a for special categories specified by
imm8, and store the results in mask vector
k.
imm can be a combination of:
0x01 // QNaN
0x02 // Positive Zero
0x04 // Negative Zero
0x08 // Positive Infinity
0x10 // Negative Infinity
0x20 // Denormal
0x40 // Negative
0x80 // SNaN
Operation
FOR j := 0 to 7
i := j*32
k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0])
ENDFOR
k[MAX:8] := 0
vfpclassps
__mmask8 _mm256_mask_fpclass_ps_mask (__mmask8 k1, __m256 a, int imm8)
Synopsis
__mmask8 _mm256_mask_fpclass_ps_mask (__mmask8 k1, __m256 a, int imm8)
#include "immintrin.h"
Instruction: vfpclassps
CPUID Flags: AVX512VL + AVX512DQ
Description
Test packed single-precision (32-bit) floating-point elements in
a for special categories specified by
imm8, and store the results in mask vector
k using zeromask
k1 (elements are zeroed out when the corresponding mask bit is not set).
imm can be a combination of:
0x01 // QNaN
0x02 // Positive Zero
0x04 // Negative Zero
0x08 // Positive Infinity
0x10 // Negative Infinity
0x20 // Denormal
0x40 // Negative
0x80 // SNaN
Operation
FOR j := 0 to 7
i := j*32
IF k1[j]
k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0])
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vfpclassps
__mmask16 _mm512_fpclass_ps_mask (__m512 a, int imm8)
Synopsis
__mmask16 _mm512_fpclass_ps_mask (__m512 a, int imm8)
#include "immintrin.h"
Instruction: vfpclassps
CPUID Flags: AVX512DQ
Description
Test packed single-precision (32-bit) floating-point elements in
a for special categories specified by
imm8, and store the results in mask vector
k.
imm can be a combination of:
0x01 // QNaN
0x02 // Positive Zero
0x04 // Negative Zero
0x08 // Positive Infinity
0x10 // Negative Infinity
0x20 // Denormal
0x40 // Negative
0x80 // SNaN
Operation
FOR j := 0 to 15
i := j*32
k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0])
ENDFOR
k[MAX:16] := 0
vfpclassps
__mmask16 _mm512_mask_fpclass_ps_mask (__mmask16 k1, __m512 a, int imm8)
Synopsis
__mmask16 _mm512_mask_fpclass_ps_mask (__mmask16 k1, __m512 a, int imm8)
#include "immintrin.h"
Instruction: vfpclassps
CPUID Flags: AVX512DQ
Description
Test packed single-precision (32-bit) floating-point elements in
a for special categories specified by
imm8, and store the results in mask vector
k using zeromask
k1 (elements are zeroed out when the corresponding mask bit is not set).
imm can be a combination of:
0x01 // QNaN
0x02 // Positive Zero
0x04 // Negative Zero
0x08 // Positive Infinity
0x10 // Negative Infinity
0x20 // Denormal
0x40 // Negative
0x80 // SNaN
Operation
FOR j := 0 to 15
i := j*32
IF k1[j]
k[j] := CheckFPClass_FP32(a[i+31:i], imm8[7:0])
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vfpclasssd
__mmask8 _mm_fpclass_sd_mask (__m128d a, int imm8)
Synopsis
__mmask8 _mm_fpclass_sd_mask (__m128d a, int imm8)
#include "immintrin.h"
Instruction: vfpclasssd
CPUID Flags: AVX512DQ
Description
Test the lower double-precision (64-bit) floating-point element in
a for special categories specified by
imm8, and store the result in mask vector
k.
imm can be a combination of:
0x01 // QNaN
0x02 // Positive Zero
0x04 // Negative Zero
0x08 // Positive Infinity
0x10 // Negative Infinity
0x20 // Denormal
0x40 // Negative
0x80 // SNaN
Operation
k[0] := CheckFPClass_FP64(a[63:0], imm8[7:0])
k[MAX:1] := 0
vfpclasssd
__mmask8 _mm_mask_fpclass_sd_mask (__mmask8 k1, __m128d a, int imm8)
Synopsis
__mmask8 _mm_mask_fpclass_sd_mask (__mmask8 k1, __m128d a, int imm8)
#include "immintrin.h"
Instruction: vfpclasssd
CPUID Flags: AVX512DQ
Description
Test the lower double-precision (64-bit) floating-point element in
a for special categories specified by
imm8, and store the result in mask vector
k using zeromask
k1 (the element is zeroed out when mask bit 0 is not set).
imm can be a combination of:
0x01 // QNaN
0x02 // Positive Zero
0x04 // Negative Zero
0x08 // Positive Infinity
0x10 // Negative Infinity
0x20 // Denormal
0x40 // Negative
0x80 // SNaN
Operation
IF k1[0]
k[0] := CheckFPClass_FP64(a[63:0], imm8[7:0])
ELSE
k[0] := 0
FI
k[MAX:1] := 0
vfpclassss
__mmask8 _mm_fpclass_ss_mask (__m128 a, int imm8)
Synopsis
__mmask8 _mm_fpclass_ss_mask (__m128 a, int imm8)
#include "immintrin.h"
Instruction: vfpclassss
CPUID Flags: AVX512DQ
Description
Test the lower single-precision (32-bit) floating-point element in
a for special categories specified by
imm8, and store the result in mask vector
k.
imm can be a combination of:
0x01 // QNaN
0x02 // Positive Zero
0x04 // Negative Zero
0x08 // Positive Infinity
0x10 // Negative Infinity
0x20 // Denormal
0x40 // Negative
0x80 // SNaN
Operation
k[0] := CheckFPClass_FP32(a[31:0], imm8[7:0])
k[MAX:1] := 0
vfpclassss
__mmask8 _mm_mask_fpclass_ss_mask (__mmask8 k1, __m128 a, int imm8)
Synopsis
__mmask8 _mm_mask_fpclass_ss_mask (__mmask8 k1, __m128 a, int imm8)
#include "immintrin.h"
Instruction: vfpclassss
CPUID Flags: AVX512DQ
Description
Test the lower single-precision (32-bit) floating-point element in
a for special categories specified by
imm8, and store the result in mask vector
k using zeromask
k1 (the element is zeroed out when mask bit 0 is not set).
imm can be a combination of:
0x01 // QNaN
0x02 // Positive Zero
0x04 // Negative Zero
0x08 // Positive Infinity
0x10 // Negative Infinity
0x20 // Denormal
0x40 // Negative
0x80 // SNaN
Operation
IF k1[0]
k[0] := CheckFPClass_FP32(a[31:0], imm8[7:0])
ELSE
k[0] := 0
FI
k[MAX:1] := 0
void _mm_free (void * mem_addr)
Synopsis
void _mm_free (void * mem_addr)
#include "xmmintrin.h"
CPUID Flags: SSE
Description
Free aligned memory that was allocated with _mm_malloc.
fxrstor
void _fxrstor (void * mem_addr)
Synopsis
void _fxrstor (void * mem_addr)
#include "immintrin.h"
Instruction: fxrstor MEMmfpxenv
CPUID Flags: FXSR
Description
Reload the x87 FPU, MMX technology, XMM, and MXCSR registers from the 512-byte memory image at mem_addr. This data should have been written to memory previously using the FXSAVE instruction, and in the same format as required by the operating mode. mem_addr must be aligned on a 16-byte boundary.
Operation
(x87 FPU, MMX, XMM7-XMM0, MXCSR) := Load(MEM[mem_addr])
fxrstor64
void _fxrstor64 (void * mem_addr)
Synopsis
void _fxrstor64 (void * mem_addr)
#include "immintrin.h"
Instruction: fxrstor64 MEMmfpxenv
CPUID Flags: FXSR
Description
Reload the x87 FPU, MMX technology, XMM, and MXCSR registers from the 512-byte memory image at mem_addr. This data should have been written to memory previously using the FXSAVE64 instruction, and in the same format as required by the operating mode. mem_addr must be aligned on a 16-byte boundary.
Operation
(x87 FPU, MMX, XMM7-XMM0, MXCSR) := Load(MEM[mem_addr])
fxsave
void _fxsave (void * mem_addr)
Synopsis
void _fxsave (void * mem_addr)
#include "immintrin.h"
Instruction: fxsave MEMmfpxenv
CPUID Flags: FXSR
Description
Save the current state of the x87 FPU, MMX technology, XMM, and MXCSR registers to a 512-byte memory location at mem_addr. The layout of the 512-byte region depends on the operating mode. Bytes [511:464] are available for software use and will not be overwritten by the processor.
Operation
MEM[mem_addr+511*8:mem_addr] := Fxsave(x87 FPU, MMX, XMM7-XMM0, MXCSR)
fxsave64
void _fxsave64 (void * mem_addr)
Synopsis
void _fxsave64 (void * mem_addr)
#include "immintrin.h"
Instruction: fxsave64 MEMmfpxenv
CPUID Flags: FXSR
Description
Save the current state of the x87 FPU, MMX technology, XMM, and MXCSR registers to a 512-byte memory location at mem_addr. The layout of the 512-byte region depends on the operating mode. Bytes [511:464] are available for software use and will not be overwritten by the processor.
Operation
MEM[mem_addr+511*8:mem_addr] := Fxsave64(x87 FPU, MMX, XMM7-XMM0, MXCSR)
unsigned int _MM_GET_EXCEPTION_MASK ()
Synopsis
unsigned int _MM_GET_EXCEPTION_MASK ()
#include "xmmintrin.h"
CPUID Flags: SSE
Description
Macro: Get the exception mask bits from the MXCSR control and status register. The exception mask may contain any of the following flags: _MM_MASK_INVALID, _MM_MASK_DIV_ZERO, _MM_MASK_DENORM, _MM_MASK_OVERFLOW, _MM_MASK_UNDERFLOW, _MM_MASK_INEXACT
Operation
dst[31:0] := MXCSR & _MM_MASK_MASK
unsigned int _MM_GET_EXCEPTION_STATE ()
Synopsis
unsigned int _MM_GET_EXCEPTION_STATE ()
#include "xmmintrin.h"
CPUID Flags: SSE
Description
Macro: Get the exception state bits from the MXCSR control and status register. The exception state may contain any of the following flags: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO, _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW, _MM_EXCEPT_INEXACT
Operation
dst[31:0] := MXCSR & _MM_EXCEPT_MASK
unsigned int _MM_GET_FLUSH_ZERO_MODE ()
Synopsis
unsigned int _MM_GET_FLUSH_ZERO_MODE ()
#include "xmmintrin.h"
CPUID Flags: SSE
Description
Macro: Get the flush zero bits from the MXCSR control and status register. The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF
Operation
dst[31:0] := MXCSR & _MM_FLUSH_MASK
unsigned int _MM_GET_ROUNDING_MODE ()
Synopsis
unsigned int _MM_GET_ROUNDING_MODE ()
#include "xmmintrin.h"
CPUID Flags: SSE
Description
Macro: Get the rounding mode bits from the MXCSR control and status register. The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
Operation
dst[31:0] := MXCSR & _MM_ROUND_MASK
stmxcsr
unsigned int _mm_getcsr (void)
Synopsis
unsigned int _mm_getcsr (void)
#include "xmmintrin.h"
Instruction: stmxcsr MEMd
CPUID Flags: SSE
Description
Get the unsigned 32-bit value of the MXCSR control and status register.
Operation
dst[31:0] := MXCSR
Performance
vgetexppd
__m128d _mm_getexp_pd (__m128d a)
Synopsis
__m128d _mm_getexp_pd (__m128d a)
#include "immintrin.h"
Instruction: vgetexppd
CPUID Flags: AVX512VL + AVX512F
Description
Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := ConvertExpFP64(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
vgetexppd
__m128d _mm_mask_getexp_pd (__m128d src, __mmask8 k, __m128d a)
Synopsis
__m128d _mm_mask_getexp_pd (__m128d src, __mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vgetexppd
CPUID Flags: AVX512VL + AVX512F
Description
Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := ConvertExpFP64(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vgetexppd
__m128d _mm_maskz_getexp_pd (__mmask8 k, __m128d a)
Synopsis
__m128d _mm_maskz_getexp_pd (__mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vgetexppd
CPUID Flags: AVX512VL + AVX512F
Description
Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := ConvertExpFP64(a[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vgetexppd
__m256d _mm256_getexp_pd (__m256d a)
Synopsis
__m256d _mm256_getexp_pd (__m256d a)
#include "immintrin.h"
Instruction: vgetexppd
CPUID Flags: AVX512VL + AVX512F
Description
Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := ConvertExpFP64(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
vgetexppd
__m256d _mm256_mask_getexp_pd (__m256d src, __mmask8 k, __m256d a)
Synopsis
__m256d _mm256_mask_getexp_pd (__m256d src, __mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vgetexppd
CPUID Flags: AVX512VL + AVX512F
Description
Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := ConvertExpFP64(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vgetexppd
__m256d _mm256_maskz_getexp_pd (__mmask8 k, __m256d a)
Synopsis
__m256d _mm256_maskz_getexp_pd (__mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vgetexppd
CPUID Flags: AVX512VL + AVX512F
Description
Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := ConvertExpFP64(a[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vgetexppd
__m512d _mm512_getexp_pd (__m512d a)
Synopsis
__m512d _mm512_getexp_pd (__m512d a)
#include "immintrin.h"
Instruction: vgetexppd zmm {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := ConvertExpFP64(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
vgetexppd
__m512d _mm512_mask_getexp_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_getexp_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vgetexppd zmm {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := ConvertExpFP64(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vgetexppd
__m512d _mm512_maskz_getexp_pd (__mmask8 k, __m512d a)
Synopsis
__m512d _mm512_maskz_getexp_pd (__mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vgetexppd zmm {k}, zmm
CPUID Flags: AVX512F
Description
Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := ConvertExpFP64(a[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vgetexpps
__m128 _mm_getexp_ps (__m128 a)
Synopsis
__m128 _mm_getexp_ps (__m128 a)
#include "immintrin.h"
Instruction: vgetexpps
CPUID Flags: AVX512VL + AVX512F
Description
Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := ConvertExpFP32(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
vgetexpps
__m128 _mm_mask_getexp_ps (__m128 src, __mmask8 k, __m128 a)
Synopsis
__m128 _mm_mask_getexp_ps (__m128 src, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vgetexpps
CPUID Flags: AVX512VL + AVX512F
Description
Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := ConvertExpFP32(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vgetexpps
__m128 _mm_maskz_getexp_ps (__mmask8 k, __m128 a)
Synopsis
__m128 _mm_maskz_getexp_ps (__mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vgetexpps
CPUID Flags: AVX512VL + AVX512F
Description
Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := ConvertExpFP32(a[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vgetexpps
__m256 _mm256_getexp_ps (__m256 a)
Synopsis
__m256 _mm256_getexp_ps (__m256 a)
#include "immintrin.h"
Instruction: vgetexpps
CPUID Flags: AVX512VL + AVX512F
Description
Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := ConvertExpFP32(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
vgetexpps
__m256 _mm256_mask_getexp_ps (__m256 src, __mmask8 k, __m256 a)
Synopsis
__m256 _mm256_mask_getexp_ps (__m256 src, __mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vgetexpps
CPUID Flags: AVX512VL + AVX512F
Description
Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := ConvertExpFP32(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vgetexpps
__m256 _mm256_maskz_getexp_ps (__mmask8 k, __m256 a)
Synopsis
__m256 _mm256_maskz_getexp_ps (__mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vgetexpps
CPUID Flags: AVX512VL + AVX512F
Description
Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := ConvertExpFP32(a[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vgetexpps
__m512 _mm512_getexp_ps (__m512 a)
Synopsis
__m512 _mm512_getexp_ps (__m512 a)
#include "immintrin.h"
Instruction: vgetexpps zmm {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := ConvertExpFP32(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
vgetexpps
__m512 _mm512_mask_getexp_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_getexp_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vgetexpps zmm {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := ConvertExpFP32(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vgetexpps
__m512 _mm512_maskz_getexp_ps (__mmask16 k, __m512 a)
Synopsis
__m512 _mm512_maskz_getexp_ps (__mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vgetexpps zmm {k}, zmm
CPUID Flags: AVX512F
Description
Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := ConvertExpFP32(a[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vgetexppd
__m512d _mm512_getexp_round_pd (__m512d a, int rounding)
Synopsis
__m512d _mm512_getexp_round_pd (__m512d a, int rounding)
#include "immintrin.h"
Instruction: vgetexppd zmm {k}, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Convert the exponent of each packed double-precision (64-bit) floating-point element in
a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in
dst. This intrinsic essentially calculates
floor(log2(x)) for each element.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := ConvertExpFP64(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
vgetexppd
__m512d _mm512_mask_getexp_round_pd (__m512d src, __mmask8 k, __m512d a, int rounding)
Synopsis
__m512d _mm512_mask_getexp_round_pd (__m512d src, __mmask8 k, __m512d a, int rounding)
#include "immintrin.h"
Instruction: vgetexppd zmm {k}, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Convert the exponent of each packed double-precision (64-bit) floating-point element in
a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set). This intrinsic essentially calculates
floor(log2(x)) for each element.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := ConvertExpFP64(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vgetexppd
__m512d _mm512_maskz_getexp_round_pd (__mmask8 k, __m512d a, int rounding)
Synopsis
__m512d _mm512_maskz_getexp_round_pd (__mmask8 k, __m512d a, int rounding)
#include "immintrin.h"
Instruction: vgetexppd zmm {k}, zmm {er}
CPUID Flags: AVX512F
Description
Convert the exponent of each packed double-precision (64-bit) floating-point element in
a to a double-precision (64-bit) floating-point number representing the integer exponent, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
floor(log2(x)) for each element.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := ConvertExpFP64(a[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vgetexpps
__m512 _mm512_getexp_round_ps (__m512 a, int rounding)
Synopsis
__m512 _mm512_getexp_round_ps (__m512 a, int rounding)
#include "immintrin.h"
Instruction: vgetexpps zmm {k}, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Convert the exponent of each packed single-precision (32-bit) floating-point element in
a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in
dst. This intrinsic essentially calculates
floor(log2(x)) for each element.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := ConvertExpFP32(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
vgetexpps
__m512 _mm512_mask_getexp_round_ps (__m512 src, __mmask16 k, __m512 a, int rounding)
Synopsis
__m512 _mm512_mask_getexp_round_ps (__m512 src, __mmask16 k, __m512 a, int rounding)
#include "immintrin.h"
Instruction: vgetexpps zmm {k}, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Convert the exponent of each packed single-precision (32-bit) floating-point element in
a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set). This intrinsic essentially calculates
floor(log2(x)) for each element.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := ConvertExpFP32(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vgetexpps
__m512 _mm512_maskz_getexp_round_ps (__mmask16 k, __m512 a, int rounding)
Synopsis
__m512 _mm512_maskz_getexp_round_ps (__mmask16 k, __m512 a, int rounding)
#include "immintrin.h"
Instruction: vgetexpps zmm {k}, zmm {er}
CPUID Flags: AVX512F
Description
Convert the exponent of each packed single-precision (32-bit) floating-point element in
a to a single-precision (32-bit) floating-point number representing the integer exponent, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
floor(log2(x)) for each element.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := ConvertExpFP32(a[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vgetexpsd
__m128d _mm_getexp_round_sd (__m128d a, __m128d b, int rounding)
Synopsis
__m128d _mm_getexp_round_sd (__m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vgetexpsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Convert the exponent of the lower double-precision (64-bit) floating-point element in
b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of
dst, and copy the upper element from
a to the upper element of
dst. This intrinsic essentially calculates
floor(log2(x)) for the lower element.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[63:0] := ConvertExpFP64(b[63:0])
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vgetexpsd
__m128d _mm_mask_getexp_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int rounding)
Synopsis
__m128d _mm_mask_getexp_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vgetexpsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Convert the exponent of the lower double-precision (64-bit) floating-point element in
b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of
dst using writemask
k (the element is copied from
src when mask bit 0 is not set), and copy the upper element from
a to the upper element of
dst. This intrinsic essentially calculates
floor(log2(x)) for the lower element.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[63:0] := ConvertExpFP64(b[63:0])
ELSE
dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vgetexpsd
__m128d _mm_maskz_getexp_round_sd (__mmask8 k, __m128d a, __m128d b, int rounding)
Synopsis
__m128d _mm_maskz_getexp_round_sd (__mmask8 k, __m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vgetexpsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Convert the exponent of the lower double-precision (64-bit) floating-point element in
b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of
dst using zeromask
k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from
a to the upper element of
dst. This intrinsic essentially calculates
floor(log2(x)) for the lower element.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[63:0] := ConvertExpFP64(b[63:0])
ELSE
dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vgetexpss
__m128 _mm_getexp_round_ss (__m128 a, __m128 b, int rounding)
Synopsis
__m128 _mm_getexp_round_ss (__m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vgetexpss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Convert the exponent of the lower single-precision (32-bit) floating-point element in
b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of
dst, and copy the upper 3 packed elements from
a to the upper elements of
dst. This intrinsic essentially calculates
floor(log2(x)) for the lower element.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[31:0] := ConvertExpFP32(b[31:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vgetexpss
__m128 _mm_mask_getexp_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int rounding)
Synopsis
__m128 _mm_mask_getexp_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vgetexpss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Convert the exponent of the lower single-precision (32-bit) floating-point element in
b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of
dst using writemask
k (the element is copied from
src when mask bit 0 is not set), and copy the upper 3 packed elements from
a to the upper elements of
dst. This intrinsic essentially calculates
floor(log2(x)) for the lower element.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[31:0] := ConvertExpFP32(b[31:0])
ELSE
dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vgetexpss
__m128 _mm_maskz_getexp_round_ss (__mmask8 k, __m128 a, __m128 b, int rounding)
Synopsis
__m128 _mm_maskz_getexp_round_ss (__mmask8 k, __m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vgetexpss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Convert the exponent of the lower single-precision (32-bit) floating-point element in
b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of
dst using zeromask
k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from
a to the upper elements of
dst. This intrinsic essentially calculates
floor(log2(x)) for the lower element.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[31:0] := ConvertExpFP32(b[31:0])
ELSE
dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vgetexpsd
__m128d _mm_getexp_sd (__m128d a, __m128d b)
Synopsis
__m128d _mm_getexp_sd (__m128d a, __m128d b)
#include "immintrin.h"
Instruction: vgetexpsd xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.
Operation
dst[63:0] := ConvertExpFP64(b[63:0])
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vgetexpsd
__m128d _mm_mask_getexp_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_mask_getexp_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vgetexpsd xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.
Operation
IF k[0]
dst[63:0] := ConvertExpFP64(b[63:0])
ELSE
dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vgetexpsd
__m128d _mm_maskz_getexp_sd (__mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_maskz_getexp_sd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vgetexpsd xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Convert the exponent of the lower double-precision (64-bit) floating-point element in b to a double-precision (64-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.
Operation
IF k[0]
dst[63:0] := ConvertExpFP64(b[63:0])
ELSE
dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vgetexpss
__m128 _mm_getexp_ss (__m128 a, __m128 b)
Synopsis
__m128 _mm_getexp_ss (__m128 a, __m128 b)
#include "immintrin.h"
Instruction: vgetexpss xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.
Operation
dst[31:0] := ConvertExpFP32(b[31:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vgetexpss
__m128 _mm_mask_getexp_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_mask_getexp_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vgetexpss xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.
Operation
IF k[0]
dst[31:0] := ConvertExpFP32(b[31:0])
ELSE
dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vgetexpss
__m128 _mm_maskz_getexp_ss (__mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_maskz_getexp_ss (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vgetexpss xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Convert the exponent of the lower single-precision (32-bit) floating-point element in b to a single-precision (32-bit) floating-point number representing the integer exponent, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. This intrinsic essentially calculates floor(log2(x)) for the lower element.
Operation
IF k[0]
dst[31:0] := ConvertExpFP32(b[31:0])
ELSE
dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vgetmantpd
__m128d _mm_getmant_pd (__m128d a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
Synopsis
__m128d _mm_getmant_pd (__m128d a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
#include "immintrin.h"
Instruction: vgetmantpd
CPUID Flags: AVX512VL + AVX512F
Description
Normalize the mantissas of packed double-precision (64-bit) floating-point elements in
a, and store the results in
dst. This intrinsic essentially calculates
±(2^k)*|x.significand|, where
k depends on the interval range defined by
interv and the sign depends on
sc and the source sign.
The mantissa is normalized to the interval specified by
interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2)
_MM_MANT_NORM_p5_2 // interval [0.5, 2)
_MM_MANT_NORM_p5_1 // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by
sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
ENDFOR
dst[MAX:128] := 0
vgetmantpd
__m128d _mm_mask_getmant_pd (__m128d src, __mmask8 k, __m128d a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
Synopsis
__m128d _mm_mask_getmant_pd (__m128d src, __mmask8 k, __m128d a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
#include "immintrin.h"
Instruction: vgetmantpd
CPUID Flags: AVX512VL + AVX512F
Description
Normalize the mantissas of packed double-precision (64-bit) floating-point elements in
a, and store the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set). This intrinsic essentially calculates
±(2^k)*|x.significand|, where
k depends on the interval range defined by
interv and the sign depends on
sc and the source sign.
The mantissa is normalized to the interval specified by
interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2)
_MM_MANT_NORM_p5_2 // interval [0.5, 2)
_MM_MANT_NORM_p5_1 // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by
sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vgetmantpd
__m128d _mm_maskz_getmant_pd (__mmask8 k, __m128d a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
Synopsis
__m128d _mm_maskz_getmant_pd (__mmask8 k, __m128d a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
#include "immintrin.h"
Instruction: vgetmantpd
CPUID Flags: AVX512VL + AVX512F
Description
Normalize the mantissas of packed double-precision (64-bit) floating-point elements in
a, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
±(2^k)*|x.significand|, where
k depends on the interval range defined by
interv and the sign depends on
sc and the source sign.
The mantissa is normalized to the interval specified by
interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2)
_MM_MANT_NORM_p5_2 // interval [0.5, 2)
_MM_MANT_NORM_p5_1 // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by
sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vgetmantpd
__m256d _mm256_getmant_pd (__m256d a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
Synopsis
__m256d _mm256_getmant_pd (__m256d a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
#include "immintrin.h"
Instruction: vgetmantpd
CPUID Flags: AVX512VL + AVX512F
Description
Normalize the mantissas of packed double-precision (64-bit) floating-point elements in
a, and store the results in
dst. This intrinsic essentially calculates
±(2^k)*|x.significand|, where
k depends on the interval range defined by
interv and the sign depends on
sc and the source sign.
The mantissa is normalized to the interval specified by
interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2)
_MM_MANT_NORM_p5_2 // interval [0.5, 2)
_MM_MANT_NORM_p5_1 // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by
sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
ENDFOR
dst[MAX:256] := 0
vgetmantpd
__m256d _mm256_mask_getmant_pd (__m256d src, __mmask8 k, __m256d a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
Synopsis
__m256d _mm256_mask_getmant_pd (__m256d src, __mmask8 k, __m256d a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
#include "immintrin.h"
Instruction: vgetmantpd
CPUID Flags: AVX512VL + AVX512F
Description
Normalize the mantissas of packed double-precision (64-bit) floating-point elements in
a, and store the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set). This intrinsic essentially calculates
±(2^k)*|x.significand|, where
k depends on the interval range defined by
interv and the sign depends on
sc and the source sign.
The mantissa is normalized to the interval specified by
interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2)
_MM_MANT_NORM_p5_2 // interval [0.5, 2)
_MM_MANT_NORM_p5_1 // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by
sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vgetmantpd
__m256d _mm256_maskz_getmant_pd (__mmask8 k, __m256d a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
Synopsis
__m256d _mm256_maskz_getmant_pd (__mmask8 k, __m256d a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
#include "immintrin.h"
Instruction: vgetmantpd
CPUID Flags: AVX512VL + AVX512F
Description
Normalize the mantissas of packed double-precision (64-bit) floating-point elements in
a, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
±(2^k)*|x.significand|, where
k depends on the interval range defined by
interv and the sign depends on
sc and the source sign.
The mantissa is normalized to the interval specified by
interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2)
_MM_MANT_NORM_p5_2 // interval [0.5, 2)
_MM_MANT_NORM_p5_1 // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by
sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vgetmantpd
__m512d _mm512_getmant_pd (__m512d a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
Synopsis
__m512d _mm512_getmant_pd (__m512d a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
#include "immintrin.h"
Instruction: vgetmantpd zmm {k}, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Normalize the mantissas of packed double-precision (64-bit) floating-point elements in
a, and store the results in
dst. This intrinsic essentially calculates
±(2^k)*|x.significand|, where
k depends on the interval range defined by
interv and the sign depends on
sc and the source sign.
The mantissa is normalized to the interval specified by
interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2)
_MM_MANT_NORM_p5_2 // interval [0.5, 2)
_MM_MANT_NORM_p5_1 // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by
sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
ENDFOR
dst[MAX:512] := 0
vgetmantpd
__m512d _mm512_mask_getmant_pd (__m512d src, __mmask8 k, __m512d a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
Synopsis
__m512d _mm512_mask_getmant_pd (__m512d src, __mmask8 k, __m512d a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
#include "immintrin.h"
Instruction: vgetmantpd zmm {k}, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Normalize the mantissas of packed double-precision (64-bit) floating-point elements in
a, and store the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set). This intrinsic essentially calculates
±(2^k)*|x.significand|, where
k depends on the interval range defined by
interv and the sign depends on
sc and the source sign.
The mantissa is normalized to the interval specified by
interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2)
_MM_MANT_NORM_p5_2 // interval [0.5, 2)
_MM_MANT_NORM_p5_1 // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by
sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vgetmantpd
__m512d _mm512_maskz_getmant_pd (__mmask8 k, __m512d a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
Synopsis
__m512d _mm512_maskz_getmant_pd (__mmask8 k, __m512d a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
#include "immintrin.h"
Instruction: vgetmantpd zmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Normalize the mantissas of packed double-precision (64-bit) floating-point elements in
a, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
±(2^k)*|x.significand|, where
k depends on the interval range defined by
interv and the sign depends on
sc and the source sign.
The mantissa is normalized to the interval specified by
interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2)
_MM_MANT_NORM_p5_2 // interval [0.5, 2)
_MM_MANT_NORM_p5_1 // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by
sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vgetmantps
__m128 _mm_getmant_ps (__m128 a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
Synopsis
__m128 _mm_getmant_ps (__m128 a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
#include "immintrin.h"
Instruction: vgetmantps
CPUID Flags: AVX512VL + AVX512F
Description
Normalize the mantissas of packed single-precision (32-bit) floating-point elements in
a, and store the results in
dst. This intrinsic essentially calculates
±(2^k)*|x.significand|, where
k depends on the interval range defined by
interv and the sign depends on
sc and the source sign.
The mantissa is normalized to the interval specified by
interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2)
_MM_MANT_NORM_p5_2 // interval [0.5, 2)
_MM_MANT_NORM_p5_1 // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by
sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
ENDFOR
dst[MAX:128] := 0
vgetmantps
__m128 _mm_mask_getmant_ps (__m128 src, __mmask8 k, __m128 a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
Synopsis
__m128 _mm_mask_getmant_ps (__m128 src, __mmask8 k, __m128 a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
#include "immintrin.h"
Instruction: vgetmantps
CPUID Flags: AVX512VL + AVX512F
Description
Normalize the mantissas of packed single-precision (32-bit) floating-point elements in
a, and store the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set). This intrinsic essentially calculates
±(2^k)*|x.significand|, where
k depends on the interval range defined by
interv and the sign depends on
sc and the source sign.
The mantissa is normalized to the interval specified by
interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2)
_MM_MANT_NORM_p5_2 // interval [0.5, 2)
_MM_MANT_NORM_p5_1 // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by
sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vgetmantps
__m128 _mm_maskz_getmant_ps (__mmask8 k, __m128 a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
Synopsis
__m128 _mm_maskz_getmant_ps (__mmask8 k, __m128 a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
#include "immintrin.h"
Instruction: vgetmantps
CPUID Flags: AVX512VL + AVX512F
Description
Normalize the mantissas of packed single-precision (32-bit) floating-point elements in
a, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
±(2^k)*|x.significand|, where
k depends on the interval range defined by
interv and the sign depends on
sc and the source sign.
The mantissa is normalized to the interval specified by
interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2)
_MM_MANT_NORM_p5_2 // interval [0.5, 2)
_MM_MANT_NORM_p5_1 // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by
sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vgetmantps
__m256 _mm256_getmant_ps (__m256 a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
Synopsis
__m256 _mm256_getmant_ps (__m256 a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
#include "immintrin.h"
Instruction: vgetmantps
CPUID Flags: AVX512VL + AVX512F
Description
Normalize the mantissas of packed single-precision (32-bit) floating-point elements in
a, and store the results in
dst. This intrinsic essentially calculates
±(2^k)*|x.significand|, where
k depends on the interval range defined by
interv and the sign depends on
sc and the source sign.
The mantissa is normalized to the interval specified by
interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2)
_MM_MANT_NORM_p5_2 // interval [0.5, 2)
_MM_MANT_NORM_p5_1 // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by
sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
ENDFOR
dst[MAX:256] := 0
vgetmantps
__m256 _mm256_mask_getmant_ps (__m256 src, __mmask8 k, __m256 a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
Synopsis
__m256 _mm256_mask_getmant_ps (__m256 src, __mmask8 k, __m256 a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
#include "immintrin.h"
Instruction: vgetmantps
CPUID Flags: AVX512VL + AVX512F
Description
Normalize the mantissas of packed single-precision (32-bit) floating-point elements in
a, and store the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set). This intrinsic essentially calculates
±(2^k)*|x.significand|, where
k depends on the interval range defined by
interv and the sign depends on
sc and the source sign.
The mantissa is normalized to the interval specified by
interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2)
_MM_MANT_NORM_p5_2 // interval [0.5, 2)
_MM_MANT_NORM_p5_1 // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by
sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vgetmantps
__m256 _mm256_maskz_getmant_ps (__mmask8 k, __m256 a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
Synopsis
__m256 _mm256_maskz_getmant_ps (__mmask8 k, __m256 a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
#include "immintrin.h"
Instruction: vgetmantps
CPUID Flags: AVX512VL + AVX512F
Description
Normalize the mantissas of packed single-precision (32-bit) floating-point elements in
a, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
±(2^k)*|x.significand|, where
k depends on the interval range defined by
interv and the sign depends on
sc and the source sign.
The mantissa is normalized to the interval specified by
interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2)
_MM_MANT_NORM_p5_2 // interval [0.5, 2)
_MM_MANT_NORM_p5_1 // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by
sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vgetmantps
__m512 _mm512_getmant_ps (__m512 a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
Synopsis
__m512 _mm512_getmant_ps (__m512 a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
#include "immintrin.h"
Instruction: vgetmantps zmm {k}, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Normalize the mantissas of packed single-precision (32-bit) floating-point elements in
a, and store the results in
dst. This intrinsic essentially calculates
±(2^k)*|x.significand|, where
k depends on the interval range defined by
interv and the sign depends on
sc and the source sign.
The mantissa is normalized to the interval specified by
interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2)
_MM_MANT_NORM_p5_2 // interval [0.5, 2)
_MM_MANT_NORM_p5_1 // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by
sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
ENDFOR
dst[MAX:512] := 0
vgetmantps
__m512 _mm512_mask_getmant_ps (__m512 src, __mmask16 k, __m512 a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
Synopsis
__m512 _mm512_mask_getmant_ps (__m512 src, __mmask16 k, __m512 a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
#include "immintrin.h"
Instruction: vgetmantps zmm {k}, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Normalize the mantissas of packed single-precision (32-bit) floating-point elements in
a, and store the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set). This intrinsic essentially calculates
±(2^k)*|x.significand|, where
k depends on the interval range defined by
interv and the sign depends on
sc and the source sign.
The mantissa is normalized to the interval specified by
interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2)
_MM_MANT_NORM_p5_2 // interval [0.5, 2)
_MM_MANT_NORM_p5_1 // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by
sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vgetmantps
__m512 _mm512_maskz_getmant_ps (__mmask16 k, __m512 a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
Synopsis
__m512 _mm512_maskz_getmant_ps (__mmask16 k, __m512 a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
#include "immintrin.h"
Instruction: vgetmantps zmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Normalize the mantissas of packed single-precision (32-bit) floating-point elements in
a, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
±(2^k)*|x.significand|, where
k depends on the interval range defined by
interv and the sign depends on
sc and the source sign.
The mantissa is normalized to the interval specified by
interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2)
_MM_MANT_NORM_p5_2 // interval [0.5, 2)
_MM_MANT_NORM_p5_1 // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by
sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vgetmantpd
__m512d _mm512_getmant_round_pd (__m512d a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc, int rounding)
Synopsis
__m512d _mm512_getmant_round_pd (__m512d a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc, int rounding)
#include "immintrin.h"
Instruction: vgetmantpd zmm {k}, zmm, imm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Normalize the mantissas of packed double-precision (64-bit) floating-point elements in
a, and store the results in
dst. This intrinsic essentially calculates
±(2^k)*|x.significand|, where
k depends on the interval range defined by
interv and the sign depends on
sc and the source sign.
The mantissa is normalized to the interval specified by
interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2)
_MM_MANT_NORM_p5_2 // interval [0.5, 2)
_MM_MANT_NORM_p5_1 // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by
sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
ENDFOR
dst[MAX:512] := 0
vgetmantpd
__m512d _mm512_mask_getmant_round_pd (__m512d src, __mmask8 k, __m512d a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc, int rounding)
Synopsis
__m512d _mm512_mask_getmant_round_pd (__m512d src, __mmask8 k, __m512d a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc, int rounding)
#include "immintrin.h"
Instruction: vgetmantpd zmm {k}, zmm, imm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Normalize the mantissas of packed double-precision (64-bit) floating-point elements in
a, and store the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set). This intrinsic essentially calculates
±(2^k)*|x.significand|, where
k depends on the interval range defined by
interv and the sign depends on
sc and the source sign.
The mantissa is normalized to the interval specified by
interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2)
_MM_MANT_NORM_p5_2 // interval [0.5, 2)
_MM_MANT_NORM_p5_1 // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by
sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vgetmantpd
__m512d _mm512_maskz_getmant_round_pd (__mmask8 k, __m512d a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc, int rounding)
Synopsis
__m512d _mm512_maskz_getmant_round_pd (__mmask8 k, __m512d a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc, int rounding)
#include "immintrin.h"
Instruction: vgetmantpd zmm {k}, zmm, imm {er}
CPUID Flags: AVX512F
Description
Normalize the mantissas of packed double-precision (64-bit) floating-point elements in
a, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
±(2^k)*|x.significand|, where
k depends on the interval range defined by
interv and the sign depends on
sc and the source sign.
The mantissa is normalized to the interval specified by
interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2)
_MM_MANT_NORM_p5_2 // interval [0.5, 2)
_MM_MANT_NORM_p5_1 // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by
sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := GetNormalizedMantissa(a[i+63:i], sc, interv)
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vgetmantps
__m512 _mm512_getmant_round_ps (__m512 a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc, int rounding)
Synopsis
__m512 _mm512_getmant_round_ps (__m512 a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc, int rounding)
#include "immintrin.h"
Instruction: vgetmantps zmm {k}, zmm, imm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Normalize the mantissas of packed single-precision (32-bit) floating-point elements in
a, and store the results in
dst. This intrinsic essentially calculates
±(2^k)*|x.significand|, where
k depends on the interval range defined by
interv and the sign depends on
sc and the source sign.
The mantissa is normalized to the interval specified by
interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2)
_MM_MANT_NORM_p5_2 // interval [0.5, 2)
_MM_MANT_NORM_p5_1 // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by
sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
ENDFOR
dst[MAX:512] := 0
vgetmantps
__m512 _mm512_mask_getmant_round_ps (__m512 src, __mmask16 k, __m512 a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc, int rounding)
Synopsis
__m512 _mm512_mask_getmant_round_ps (__m512 src, __mmask16 k, __m512 a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc, int rounding)
#include "immintrin.h"
Instruction: vgetmantps zmm {k}, zmm, imm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Normalize the mantissas of packed single-precision (32-bit) floating-point elements in
a, and store the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set). This intrinsic essentially calculates
±(2^k)*|x.significand|, where
k depends on the interval range defined by
interv and the sign depends on
sc and the source sign.
The mantissa is normalized to the interval specified by
interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2)
_MM_MANT_NORM_p5_2 // interval [0.5, 2)
_MM_MANT_NORM_p5_1 // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by
sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vgetmantps
__m512 _mm512_maskz_getmant_round_ps (__mmask16 k, __m512 a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc, int rounding)
Synopsis
__m512 _mm512_maskz_getmant_round_ps (__mmask16 k, __m512 a, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc, int rounding)
#include "immintrin.h"
Instruction: vgetmantps zmm {k}, zmm, imm {er}
CPUID Flags: AVX512F
Description
Normalize the mantissas of packed single-precision (32-bit) floating-point elements in
a, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
±(2^k)*|x.significand|, where
k depends on the interval range defined by
interv and the sign depends on
sc and the source sign.
The mantissa is normalized to the interval specified by
interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2)
_MM_MANT_NORM_p5_2 // interval [0.5, 2)
_MM_MANT_NORM_p5_1 // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by
sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := GetNormalizedMantissa(a[i+31:i], sc, interv)
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vgetmantsd
__m128d _mm_getmant_round_sd (__m128d a, __m128d b, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc, int rounding)
Synopsis
__m128d _mm_getmant_round_sd (__m128d a, __m128d b, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc, int rounding)
#include "immintrin.h"
Instruction: vgetmantsd xmm {k}, xmm, xmm, imm {er}
CPUID Flags: AVX512F
Description
Normalize the mantissas of the lower double-precision (64-bit) floating-point element in
a, store the result in the lower element of
dst, and copy the upper element from
b to the upper element of
dst. This intrinsic essentially calculates
±(2^k)*|x.significand|, where
k depends on the interval range defined by
interv and the sign depends on
sc and the source sign.
The mantissa is normalized to the interval specified by
interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2)
_MM_MANT_NORM_p5_2 // interval [0.5, 2)
_MM_MANT_NORM_p5_1 // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by
sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[63:0] := GetNormalizedMantissa(a[63:0], sc, interv)
dst[127:64] := b[127:64]
dst[MAX:128] := 0
vgetmantsd
__m128d _mm_mask_getmant_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc, int rounding)
Synopsis
__m128d _mm_mask_getmant_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc, int rounding)
#include "immintrin.h"
Instruction: vgetmantsd xmm {k}, xmm, xmm, imm {er}
CPUID Flags: AVX512F
Description
Normalize the mantissas of the lower double-precision (64-bit) floating-point element in
a, store the result in the lower element of
dst using writemask
k (the element is copied from
src when mask bit 0 is not set), and copy the upper element from
b to the upper element of
dst. This intrinsic essentially calculates
±(2^k)*|x.significand|, where
k depends on the interval range defined by
interv and the sign depends on
sc and the source sign.
The mantissa is normalized to the interval specified by
interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2)
_MM_MANT_NORM_p5_2 // interval [0.5, 2)
_MM_MANT_NORM_p5_1 // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by
sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[63:0] := GetNormalizedMantissa(a[63:0], sc, interv)
ELSE
dst[63:0] := src[63:0]
FI
dst[127:64] := b[127:64]
dst[MAX:128] := 0
vgetmantsd
__m128d _mm_maskz_getmant_round_sd (__mmask8 k, __m128d a, __m128d b, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc, int rounding)
Synopsis
__m128d _mm_maskz_getmant_round_sd (__mmask8 k, __m128d a, __m128d b, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc, int rounding)
#include "immintrin.h"
Instruction: vgetmantsd xmm {k}, xmm, xmm, imm {er}
CPUID Flags: AVX512F
Description
Normalize the mantissas of the lower double-precision (64-bit) floating-point element in
a, store the result in the lower element of
dst using zeromask
k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from
b to the upper element of
dst. This intrinsic essentially calculates
±(2^k)*|x.significand|, where
k depends on the interval range defined by
interv and the sign depends on
sc and the source sign.
The mantissa is normalized to the interval specified by
interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2)
_MM_MANT_NORM_p5_2 // interval [0.5, 2)
_MM_MANT_NORM_p5_1 // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by
sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[63:0] := GetNormalizedMantissa(a[63:0], sc, interv)
ELSE
dst[63:0] := 0
FI
dst[127:64] := b[127:64]
dst[MAX:128] := 0
vgetmantss
__m128 _mm_getmant_round_ss (__m128 a, __m128 b, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc, int rounding)
Synopsis
__m128 _mm_getmant_round_ss (__m128 a, __m128 b, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc, int rounding)
#include "immintrin.h"
Instruction: vgetmantss xmm {k}, xmm, xmm, imm {er}
CPUID Flags: AVX512F
Description
Normalize the mantissas of the lower single-precision (32-bit) floating-point element in
a, store the result in the lower element of
dst, and copy the upper 3 packed elements from
b to the upper elements of
dst. This intrinsic essentially calculates
±(2^k)*|x.significand|, where
k depends on the interval range defined by
interv and the sign depends on
sc and the source sign.
The mantissa is normalized to the interval specified by
interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2)
_MM_MANT_NORM_p5_2 // interval [0.5, 2)
_MM_MANT_NORM_p5_1 // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by
sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[31:0] := GetNormalizedMantissa(a[31:0], sc, interv)
dst[127:32] := b[127:32]
dst[MAX:128] := 0
vgetmantss
__m128 _mm_mask_getmant_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc, int rounding)
Synopsis
__m128 _mm_mask_getmant_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc, int rounding)
#include "immintrin.h"
Instruction: vgetmantss xmm {k}, xmm, xmm, imm {er}
CPUID Flags: AVX512F
Description
Normalize the mantissas of the lower single-precision (32-bit) floating-point element in
a, store the result in the lower element of
dst using writemask
k (the element is copied from
src when mask bit 0 is not set), and copy the upper 3 packed elements from
b to the upper elements of
dst. This intrinsic essentially calculates
±(2^k)*|x.significand|, where
k depends on the interval range defined by
interv and the sign depends on
sc and the source sign.
The mantissa is normalized to the interval specified by
interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2)
_MM_MANT_NORM_p5_2 // interval [0.5, 2)
_MM_MANT_NORM_p5_1 // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by
sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[31:0] := GetNormalizedMantissa(a[31:0], sc, interv)
ELSE
dst[31:0] := src[31:0]
FI
dst[127:32] := b[127:32]
dst[MAX:128] := 0
vgetmantss
__m128 _mm_maskz_getmant_round_ss (__mmask8 k, __m128 a, __m128 b, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc, int rounding)
Synopsis
__m128 _mm_maskz_getmant_round_ss (__mmask8 k, __m128 a, __m128 b, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc, int rounding)
#include "immintrin.h"
Instruction: vgetmantss xmm {k}, xmm, xmm, imm {er}
CPUID Flags: AVX512F
Description
Normalize the mantissas of the lower single-precision (32-bit) floating-point element in
a, store the result in the lower element of
dst using zeromask
k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from
b to the upper elements of
dst. This intrinsic essentially calculates
±(2^k)*|x.significand|, where
k depends on the interval range defined by
interv and the sign depends on
sc and the source sign.
The mantissa is normalized to the interval specified by
interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2)
_MM_MANT_NORM_p5_2 // interval [0.5, 2)
_MM_MANT_NORM_p5_1 // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by
sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[31:0] := GetNormalizedMantissa(a[31:0], sc, interv)
ELSE
dst[31:0] := 0
FI
dst[127:32] := b[127:32]
dst[MAX:128] := 0
vgetmantsd
__m128d _mm_getmant_sd (__m128d a, __m128d b, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
Synopsis
__m128d _mm_getmant_sd (__m128d a, __m128d b, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
#include "immintrin.h"
Instruction: vgetmantsd xmm {k}, xmm, xmm, imm
CPUID Flags: AVX512F
Description
Normalize the mantissas of the lower double-precision (64-bit) floating-point element in
a, store the result in the lower element of
dst, and copy the upper element from
b to the upper element of
dst. This intrinsic essentially calculates
±(2^k)*|x.significand|, where
k depends on the interval range defined by
interv and the sign depends on
sc and the source sign.
The mantissa is normalized to the interval specified by
interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2)
_MM_MANT_NORM_p5_2 // interval [0.5, 2)
_MM_MANT_NORM_p5_1 // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by
sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
Operation
dst[63:0] := GetNormalizedMantissa(a[63:0], sc, interv)
dst[127:64] := b[127:64]
dst[MAX:128] := 0
vgetmantsd
__m128d _mm_mask_getmant_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
Synopsis
__m128d _mm_mask_getmant_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
#include "immintrin.h"
Instruction: vgetmantsd xmm {k}, xmm, xmm, imm
CPUID Flags: AVX512F
Description
Normalize the mantissas of the lower double-precision (64-bit) floating-point element in
a, store the result in the lower element of
dst using writemask
k (the element is copied from
src when mask bit 0 is not set), and copy the upper element from
b to the upper element of
dst. This intrinsic essentially calculates
±(2^k)*|x.significand|, where
k depends on the interval range defined by
interv and the sign depends on
sc and the source sign.
The mantissa is normalized to the interval specified by
interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2)
_MM_MANT_NORM_p5_2 // interval [0.5, 2)
_MM_MANT_NORM_p5_1 // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by
sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
Operation
IF k[0]
dst[63:0] := GetNormalizedMantissa(a[63:0], sc, interv)
ELSE
dst[63:0] := src[63:0]
FI
dst[127:64] := b[127:64]
dst[MAX:128] := 0
vgetmantsd
__m128d _mm_maskz_getmant_sd (__mmask8 k, __m128d a, __m128d b, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
Synopsis
__m128d _mm_maskz_getmant_sd (__mmask8 k, __m128d a, __m128d b, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
#include "immintrin.h"
Instruction: vgetmantsd xmm {k}, xmm, xmm, imm
CPUID Flags: AVX512F
Description
Normalize the mantissas of the lower double-precision (64-bit) floating-point element in
a, store the result in the lower element of
dst using zeromask
k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from
b to the upper element of
dst. This intrinsic essentially calculates
±(2^k)*|x.significand|, where
k depends on the interval range defined by
interv and the sign depends on
sc and the source sign.
The mantissa is normalized to the interval specified by
interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2)
_MM_MANT_NORM_p5_2 // interval [0.5, 2)
_MM_MANT_NORM_p5_1 // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by
sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
Operation
IF k[0]
dst[63:0] := GetNormalizedMantissa(a[63:0], sc, interv)
ELSE
dst[63:0] := 0
FI
dst[127:64] := b[127:64]
dst[MAX:128] := 0
vgetmantss
__m128 _mm_getmant_ss (__m128 a, __m128 b, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
Synopsis
__m128 _mm_getmant_ss (__m128 a, __m128 b, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
#include "immintrin.h"
Instruction: vgetmantss xmm {k}, xmm, xmm, imm
CPUID Flags: AVX512F
Description
Normalize the mantissas of the lower single-precision (32-bit) floating-point element in
a, store the result in the lower element of
dst, and copy the upper 3 packed elements from
b to the upper elements of
dst. This intrinsic essentially calculates
±(2^k)*|x.significand|, where
k depends on the interval range defined by
interv and the sign depends on
sc and the source sign.
The mantissa is normalized to the interval specified by
interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2)
_MM_MANT_NORM_p5_2 // interval [0.5, 2)
_MM_MANT_NORM_p5_1 // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by
sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
Operation
dst[31:0] := GetNormalizedMantissa(a[31:0], sc, interv)
dst[127:32] := b[127:32]
dst[MAX:128] := 0
vgetmantss
__m128 _mm_mask_getmant_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
Synopsis
__m128 _mm_mask_getmant_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
#include "immintrin.h"
Instruction: vgetmantss xmm {k}, xmm, xmm, imm
CPUID Flags: AVX512F
Description
Normalize the mantissas of the lower single-precision (32-bit) floating-point element in
a, store the result in the lower element of
dst using writemask
k (the element is copied from
src when mask bit 0 is not set), and copy the upper 3 packed elements from
b to the upper elements of
dst. This intrinsic essentially calculates
±(2^k)*|x.significand|, where
k depends on the interval range defined by
interv and the sign depends on
sc and the source sign.
The mantissa is normalized to the interval specified by
interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2)
_MM_MANT_NORM_p5_2 // interval [0.5, 2)
_MM_MANT_NORM_p5_1 // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by
sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
Operation
IF k[0]
dst[31:0] := GetNormalizedMantissa(a[31:0], sc, interv)
ELSE
dst[31:0] := src[31:0]
FI
dst[127:32] := b[127:32]
dst[MAX:128] := 0
vgetmantss
__m128 _mm_maskz_getmant_ss (__mmask8 k, __m128 a, __m128 b, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
Synopsis
__m128 _mm_maskz_getmant_ss (__mmask8 k, __m128 a, __m128 b, _MM_MANTISSA_NORM_ENUM interv, _MM_MANTISSA_SIGN_ENUM sc)
#include "immintrin.h"
Instruction: vgetmantss xmm {k}, xmm, xmm, imm
CPUID Flags: AVX512F
Description
Normalize the mantissas of the lower single-precision (32-bit) floating-point element in
a, store the result in the lower element of
dst using zeromask
k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from
b to the upper elements of
dst. This intrinsic essentially calculates
±(2^k)*|x.significand|, where
k depends on the interval range defined by
interv and the sign depends on
sc and the source sign.
The mantissa is normalized to the interval specified by
interv, which can take the following values:
_MM_MANT_NORM_1_2 // interval [1, 2)
_MM_MANT_NORM_p5_2 // interval [0.5, 2)
_MM_MANT_NORM_p5_1 // interval [0.5, 1)
_MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
The sign is determined by
sc which can take the following values:
_MM_MANT_SIGN_src // sign = sign(src)
_MM_MANT_SIGN_zero // sign = 0
_MM_MANT_SIGN_nan // dst = NaN if sign(src) = 1
Operation
IF k[0]
dst[31:0] := GetNormalizedMantissa(a[31:0], sc, interv)
ELSE
dst[31:0] := 0
FI
dst[127:32] := b[127:32]
dst[MAX:128] := 0
vgmaxpd
__m512d _mm512_gmax_pd (__m512d a, __m512d b)
Synopsis
__m512d _mm512_gmax_pd (__m512d a, __m512d b)
#include "immintrin.h"
Instruction: vgmaxpd zmm {k}, zmm, m512
CPUID Flags: KNCNI
Description
Determines the maximum of each pair of corresponding elements in packed double-precision (64-bit) floating-point elements in a and b, storing the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := FpMax(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:512] := 0
vgmaxpd
__m512d _mm512_mask_gmax_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
Synopsis
__m512d _mm512_mask_gmax_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vgmaxpd zmm {k}, zmm, m512
CPUID Flags: KNCNI
Description
Determines the maximum of each pair of corresponding elements of packed double-precision (64-bit) floating-point elements in a and b, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := FpMax(a[i+63:i], b[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vgmaxps
__m512 _mm512_gmax_ps (__m512 a, __m512 b)
Synopsis
__m512 _mm512_gmax_ps (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vgmaxps zmm {k}, zmm, m512
CPUID Flags: KNCNI
Description
Determines the maximum of each pair of corresponding elements in packed single-precision (32-bit) floating-point elements in a and b, storing the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := FpMax(a[i+31:i], b[i+31:i])
ENDFOR
dst[MAX:512] := 0
vgmaxps
__m512 _mm512_mask_gmax_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
Synopsis
__m512 _mm512_mask_gmax_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vgmaxps zmm {k}, zmm, m512
CPUID Flags: KNCNI
Description
Determines the maximum of each pair of corresponding elements of packed single-precision (32-bit) floating-point elements in a and b, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := FpMax(a[i+31:i], b[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vgmaxabsps
__m512 _mm512_gmaxabs_ps (__m512 a, __m512 b)
Synopsis
__m512 _mm512_gmaxabs_ps (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vgmaxabsps zmm {k}, zmm, m512
CPUID Flags: KNCNI
Description
Determines the maximum of the absolute elements of each pair of corresponding elements of packed single-precision (32-bit) floating-point elements in a and b, storing the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := FpMax(Abs(a[i+31:i]), Abs(b[i+31:i]))
ENDFOR
dst[MAX:512] := 0
vgmaxabsps
__m512 _mm512_mask_gmaxabs_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
Synopsis
__m512 _mm512_mask_gmaxabs_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vgmaxabsps zmm {k}, zmm, m512
CPUID Flags: KNCNI
Description
Determines the maximum of the absolute elements of each pair of corresponding elements of packed single-precision (32-bit) floating-point elements in a and b, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := FpMax(Abs(a[i+31:i]), Abs(b[i+31:i]))
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vgminpd
__m512d _mm512_gmin_pd (__m512d a, __m512d b)
Synopsis
__m512d _mm512_gmin_pd (__m512d a, __m512d b)
#include "immintrin.h"
Instruction: vgminpd zmm {k}, zmm, m512
CPUID Flags: KNCNI
Description
Determines the minimum of each pair of corresponding elements in packed double-precision (64-bit) floating-point elements in a and b, storing the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := FpMin(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:512] := 0
vgminpd
__m512d _mm512_mask_gmin_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
Synopsis
__m512d _mm512_mask_gmin_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vgminpd zmm {k}, zmm, m512
CPUID Flags: KNCNI
Description
Determines the minimum of each pair of corresponding elements of packed double-precision (64-bit) floating-point elements in a and b, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := FpMin(a[i+63:i], b[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vgminps
__m512 _mm512_gmin_ps (__m512 a, __m512 b)
Synopsis
__m512 _mm512_gmin_ps (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vgminps zmm {k}, zmm, m512
CPUID Flags: KNCNI
Description
Determines the minimum of each pair of corresponding elements in packed single-precision (32-bit) floating-point elements in a and b, storing the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := FpMin(a[i+31:i], b[i+31:i])
ENDFOR
dst[MAX:512] := 0
vgminps
__m512 _mm512_mask_gmin_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
Synopsis
__m512 _mm512_mask_gmin_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vgminps zmm {k}, zmm, m512
CPUID Flags: KNCNI
Description
Determines the minimum of each pair of corresponding elements of packed single-precision (32-bit) floating-point elements in a and b, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := FpMin(a[i+31:i], b[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
phaddw
__m128i _mm_hadd_epi16 (__m128i a, __m128i b)
Synopsis
__m128i _mm_hadd_epi16 (__m128i a, __m128i b)
#include "tmmintrin.h"
Instruction: phaddw xmm, xmm
CPUID Flags: SSSE3
Description
Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the signed 16-bit results in dst.
Operation
dst[15:0] := a[31:16] + a[15:0]
dst[31:16] := a[63:48] + a[47:32]
dst[47:32] := a[95:80] + a[79:64]
dst[63:48] := a[127:112] + a[111:96]
dst[79:64] := b[31:16] + b[15:0]
dst[95:80] := b[63:48] + b[47:32]
dst[111:96] := b[95:80] + b[79:64]
dst[127:112] := b[127:112] + b[111:96]
Performance
vphaddw
__m256i _mm256_hadd_epi16 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_hadd_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vphaddw ymm, ymm, ymm
CPUID Flags: AVX2
Description
Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the signed 16-bit results in dst.
Operation
dst[15:0] := a[31:16] + a[15:0]
dst[31:16] := a[63:48] + a[47:32]
dst[47:32] := a[95:80] + a[79:64]
dst[63:48] := a[127:112] + a[111:96]
dst[79:64] := b[31:16] + b[15:0]
dst[95:80] := b[63:48] + b[47:32]
dst[111:96] := b[95:80] + b[79:64]
dst[127:112] := b[127:112] + b[111:96]
dst[143:128] := a[159:144] + a[143:128]
dst[159:144] := a[191:176] + a[175:160]
dst[175:160] := a[223:208] + a[207:192]
dst[191:176] := a[255:240] + a[239:224]
dst[207:192] := b[159:144] + b[143:128]
dst[223:208] := b[191:176] + b[175:160]
dst[239:224] := b[223:208] + b[207:192]
dst[255:240] := b[255:240] + b[239:224]
dst[MAX:256] := 0
Performance
phaddd
__m128i _mm_hadd_epi32 (__m128i a, __m128i b)
Synopsis
__m128i _mm_hadd_epi32 (__m128i a, __m128i b)
#include "tmmintrin.h"
Instruction: phaddd xmm, xmm
CPUID Flags: SSSE3
Description
Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the signed 32-bit results in dst.
Operation
dst[31:0] := a[63:32] + a[31:0]
dst[63:32] := a[127:96] + a[95:64]
dst[95:64] := b[63:32] + b[31:0]
dst[127:96] := b[127:96] + b[95:64]
Performance
vphaddd
__m256i _mm256_hadd_epi32 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_hadd_epi32 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vphaddd ymm, ymm, ymm
CPUID Flags: AVX2
Description
Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the signed 32-bit results in dst.
Operation
dst[31:0] := a[63:32] + a[31:0]
dst[63:32] := a[127:96] + a[95:64]
dst[95:64] := b[63:32] + b[31:0]
dst[127:96] := b[127:96] + b[95:64]
dst[159:128] := a[191:160] + a[159:128]
dst[191:160] := a[255:224] + a[223:192]
dst[223:192] := b[191:160] + b[159:128]
dst[255:224] := b[255:224] + b[223:192]
dst[MAX:256] := 0
Performance
haddpd
__m128d _mm_hadd_pd (__m128d a, __m128d b)
Synopsis
__m128d _mm_hadd_pd (__m128d a, __m128d b)
#include "pmmintrin.h"
Instruction: haddpd xmm, xmm
CPUID Flags: SSE3
Description
Horizontally add adjacent pairs of double-precision (64-bit) floating-point elements in a and b, and pack the results in dst.
Operation
dst[63:0] := a[127:64] + a[63:0]
dst[127:64] := b[127:64] + b[63:0]
Performance
vhaddpd
__m256d _mm256_hadd_pd (__m256d a, __m256d b)
Synopsis
__m256d _mm256_hadd_pd (__m256d a, __m256d b)
#include "immintrin.h"
Instruction: vhaddpd ymm, ymm, ymm
CPUID Flags: AVX
Description
Horizontally add adjacent pairs of double-precision (64-bit) floating-point elements in a and b, and pack the results in dst.
Operation
dst[63:0] := a[127:64] + a[63:0]
dst[127:64] := b[127:64] + b[63:0]
dst[191:128] := a[255:192] + a[191:128]
dst[255:192] := b[255:192] + b[191:128]
dst[MAX:256] := 0
Performance
phaddw
__m64 _mm_hadd_pi16 (__m64 a, __m64 b)
Synopsis
__m64 _mm_hadd_pi16 (__m64 a, __m64 b)
#include "tmmintrin.h"
Instruction: phaddw mm, mm
CPUID Flags: SSSE3
Description
Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the signed 16-bit results in dst.
Operation
dst[15:0] := a[31:16] + a[15:0]
dst[31:16] := a[63:48] + a[47:32]
dst[47:32] := b[31:16] + b[15:0]
dst[63:48] := b[63:48] + b[47:32]
phaddd
__m64 _mm_hadd_pi32 (__m64 a, __m64 b)
Synopsis
__m64 _mm_hadd_pi32 (__m64 a, __m64 b)
#include "tmmintrin.h"
Instruction: phaddd mm, mm
CPUID Flags: SSSE3
Description
Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the signed 32-bit results in dst.
Operation
dst[31:0] := a[63:32] + a[31:0]
dst[63:32] := b[63:32] + b[31:0]
haddps
__m128 _mm_hadd_ps (__m128 a, __m128 b)
Synopsis
__m128 _mm_hadd_ps (__m128 a, __m128 b)
#include "pmmintrin.h"
Instruction: haddps xmm, xmm
CPUID Flags: SSE3
Description
Horizontally add adjacent pairs of single-precision (32-bit) floating-point elements in a and b, and pack the results in dst.
Operation
dst[31:0] := a[63:32] + a[31:0]
dst[63:32] := a[127:96] + a[95:64]
dst[95:64] := b[63:32] + b[31:0]
dst[127:96] := b[127:96] + b[95:64]
Performance
vhaddps
__m256 _mm256_hadd_ps (__m256 a, __m256 b)
Synopsis
__m256 _mm256_hadd_ps (__m256 a, __m256 b)
#include "immintrin.h"
Instruction: vhaddps ymm, ymm, ymm
CPUID Flags: AVX
Description
Horizontally add adjacent pairs of single-precision (32-bit) floating-point elements in a and b, and pack the results in dst.
Operation
dst[31:0] := a[63:32] + a[31:0]
dst[63:32] := a[127:96] + a[95:64]
dst[95:64] := b[63:32] + b[31:0]
dst[127:96] := b[127:96] + b[95:64]
dst[159:128] := a[191:160] + a[159:128]
dst[191:160] := a[255:224] + a[223:192]
dst[223:192] := b[191:160] + b[159:128]
dst[255:224] := b[255:224] + b[223:192]
dst[MAX:256] := 0
Performance
phaddsw
__m128i _mm_hadds_epi16 (__m128i a, __m128i b)
Synopsis
__m128i _mm_hadds_epi16 (__m128i a, __m128i b)
#include "tmmintrin.h"
Instruction: phaddsw xmm, xmm
CPUID Flags: SSSE3
Description
Horizontally add adjacent pairs of 16-bit integers in a and b using saturation, and pack the signed 16-bit results in dst.
Operation
dst[15:0]= Saturate_To_Int16(a[31:16] + a[15:0])
dst[31:16] = Saturate_To_Int16(a[63:48] + a[47:32])
dst[47:32] = Saturate_To_Int16(a[95:80] + a[79:64])
dst[63:48] = Saturate_To_Int16(a[127:112] + a[111:96])
dst[79:64] = Saturate_To_Int16(b[31:16] + b[15:0])
dst[95:80] = Saturate_To_Int16(b[63:48] + b[47:32])
dst[111:96] = Saturate_To_Int16(b[95:80] + b[79:64])
dst[127:112] = Saturate_To_Int16(b[127:112] + b[111:96])
Performance
vphaddsw
__m256i _mm256_hadds_epi16 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_hadds_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vphaddsw ymm, ymm, ymm
CPUID Flags: AVX2
Description
Horizontally add adjacent pairs of 16-bit integers in a and b using saturation, and pack the signed 16-bit results in dst.
Operation
dst[15:0]= Saturate_To_Int16(a[31:16] + a[15:0])
dst[31:16] = Saturate_To_Int16(a[63:48] + a[47:32])
dst[47:32] = Saturate_To_Int16(a[95:80] + a[79:64])
dst[63:48] = Saturate_To_Int16(a[127:112] + a[111:96])
dst[79:64] = Saturate_To_Int16(b[31:16] + b[15:0])
dst[95:80] = Saturate_To_Int16(b[63:48] + b[47:32])
dst[111:96] = Saturate_To_Int16(b[95:80] + b[79:64])
dst[127:112] = Saturate_To_Int16(b[127:112] + b[111:96])
dst[143:128] = Saturate_To_Int16(a[159:144] + a[143:128])
dst[159:144] = Saturate_To_Int16(a[191:176] + a[175:160])
dst[175:160] = Saturate_To_Int16( a[223:208] + a[207:192])
dst[191:176] = Saturate_To_Int16(a[255:240] + a[239:224])
dst[207:192] = Saturate_To_Int16(b[159:144] + b[143:128])
dst[223:208] = Saturate_To_Int16(b[191:176] + b[175:160])
dst[239:224] = Saturate_To_Int16(b[223:208] + b[207:192])
dst[255:240] = Saturate_To_Int16(b[255:240] + b[239:224])
dst[MAX:256] := 0
Performance
phaddsw
__m64 _mm_hadds_pi16 (__m64 a, __m64 b)
Synopsis
__m64 _mm_hadds_pi16 (__m64 a, __m64 b)
#include "tmmintrin.h"
Instruction: phaddsw mm, mm
CPUID Flags: SSSE3
Description
Horizontally add adjacent pairs of 16-bit integers in a and b using saturation, and pack the signed 16-bit results in dst.
Operation
dst[15:0]= Saturate_To_Int16(a[31:16] + a[15:0])
dst[31:16] = Saturate_To_Int16(a[63:48] + a[47:32])
dst[47:32] = Saturate_To_Int16(b[31:16] + b[15:0])
dst[63:48] = Saturate_To_Int16(b[63:48] + b[47:32])
phsubw
__m128i _mm_hsub_epi16 (__m128i a, __m128i b)
Synopsis
__m128i _mm_hsub_epi16 (__m128i a, __m128i b)
#include "tmmintrin.h"
Instruction: phsubw xmm, xmm
CPUID Flags: SSSE3
Description
Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack the signed 16-bit results in dst.
Operation
dst[15:0] := a[15:0] - a[31:16]
dst[31:16] := a[47:32] - a[63:48]
dst[47:32] := a[79:64] - a[95:80]
dst[63:48] := a[111:96] - a[127:112]
dst[79:64] := b[15:0] - b[31:16]
dst[95:80] := b[47:32] - b[63:48]
dst[111:96] := b[79:64] - b[95:80]
dst[127:112] := b[111:96] - b[127:112]
Performance
vphsubw
__m256i _mm256_hsub_epi16 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_hsub_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vphsubw ymm, ymm, ymm
CPUID Flags: AVX2
Description
Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack the signed 16-bit results in dst.
Operation
dst[15:0] := a[15:0] - a[31:16]
dst[31:16] := a[47:32] - a[63:48]
dst[47:32] := a[79:64] - a[95:80]
dst[63:48] := a[111:96] - a[127:112]
dst[79:64] := b[15:0] - b[31:16]
dst[95:80] := b[47:32] - b[63:48]
dst[111:96] := b[79:64] - b[95:80]
dst[127:112] := b[111:96] - b[127:112]
dst[143:128] := a[143:128] - a[159:144]
dst[159:144] := a[175:160] - a[191:176]
dst[175:160] := a[207:192] - a[223:208]
dst[191:176] := a[239:224] - a[255:240]
dst[207:192] := b[143:128] - b[159:144]
dst[223:208] := b[175:160] - b[191:176]
dst[239:224] := b[207:192] - b[223:208]
dst[255:240] := b[239:224] - b[255:240]
dst[MAX:256] := 0
Performance
phsubd
__m128i _mm_hsub_epi32 (__m128i a, __m128i b)
Synopsis
__m128i _mm_hsub_epi32 (__m128i a, __m128i b)
#include "tmmintrin.h"
Instruction: phsubd xmm, xmm
CPUID Flags: SSSE3
Description
Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack the signed 32-bit results in dst.
Operation
dst[31:0] := a[31:0] - a[63:32]
dst[63:32] := a[95:64] - a[127:96]
dst[95:64] := b[31:0] - b[63:32]
dst[127:96] := b[95:64] - b[127:96]
Performance
vphsubd
__m256i _mm256_hsub_epi32 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_hsub_epi32 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vphsubd ymm, ymm, ymm
CPUID Flags: AVX2
Description
Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack the signed 32-bit results in dst.
Operation
dst[31:0] := a[31:0] - a[63:32]
dst[63:32] := a[95:64] - a[127:96]
dst[95:64] := b[31:0] - b[63:32]
dst[127:96] := b[95:64] - b[127:96]
dst[159:128] := a[159:128] - a[191:160]
dst[191:160] := a[223:192] - a[255:224]
dst[223:192] := b[159:128] - b[191:160]
dst[255:224] := b[223:192] - b[255:224]
dst[MAX:256] := 0
Performance
hsubpd
__m128d _mm_hsub_pd (__m128d a, __m128d b)
Synopsis
__m128d _mm_hsub_pd (__m128d a, __m128d b)
#include "pmmintrin.h"
Instruction: hsubpd xmm, xmm
CPUID Flags: SSE3
Description
Horizontally subtract adjacent pairs of double-precision (64-bit) floating-point elements in a and b, and pack the results in dst.
Operation
dst[63:0] := a[63:0] - a[127:64]
dst[127:64] := b[63:0] - b[127:64]
Performance
vhsubpd
__m256d _mm256_hsub_pd (__m256d a, __m256d b)
Synopsis
__m256d _mm256_hsub_pd (__m256d a, __m256d b)
#include "immintrin.h"
Instruction: vhsubpd ymm, ymm, ymm
CPUID Flags: AVX
Description
Horizontally subtract adjacent pairs of double-precision (64-bit) floating-point elements in a and b, and pack the results in dst.
Operation
dst[63:0] := a[63:0] - a[127:64]
dst[127:64] := b[63:0] - b[127:64]
dst[191:128] := a[191:128] - a[255:192]
dst[255:192] := b[191:128] - b[255:192]
dst[MAX:256] := 0
Performance
phsubw
__m64 _mm_hsub_pi16 (__m64 a, __m64 b)
Synopsis
__m64 _mm_hsub_pi16 (__m64 a, __m64 b)
#include "tmmintrin.h"
Instruction: phsubw mm, mm
CPUID Flags: SSSE3
Description
Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack the signed 16-bit results in dst.
Operation
dst[15:0] := a[15:0] - a[31:16]
dst[31:16] := a[47:32] - a[63:48]
dst[47:32] := b[15:0] - b[31:16]
dst[63:48] := b[47:32] - b[63:48]
phsubd
__m64 _mm_hsub_pi32 (__m64 a, __m64 b)
Synopsis
__m64 _mm_hsub_pi32 (__m64 a, __m64 b)
#include "tmmintrin.h"
Instruction: phsubd mm, mm
CPUID Flags: SSSE3
Description
Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack the signed 32-bit results in dst.
Operation
dst[31:0] := a[31:0] - a[63:32]
dst[63:32] := b[31:0] - b[63:32]
hsubps
__m128 _mm_hsub_ps (__m128 a, __m128 b)
Synopsis
__m128 _mm_hsub_ps (__m128 a, __m128 b)
#include "pmmintrin.h"
Instruction: hsubps xmm, xmm
CPUID Flags: SSE3
Description
Horizontally subtract adjacent pairs of single-precision (32-bit) floating-point elements in a and b, and pack the results in dst.
Operation
dst[31:0] := a[31:0] - a[63:32]
dst[63:32] := a[95:64] - a[127:96]
dst[95:64] := b[31:0] - b[63:32]
dst[127:96] := b[95:64] - b[127:96]
Performance
vhsubps
__m256 _mm256_hsub_ps (__m256 a, __m256 b)
Synopsis
__m256 _mm256_hsub_ps (__m256 a, __m256 b)
#include "immintrin.h"
Instruction: vhsubps ymm, ymm, ymm
CPUID Flags: AVX
Description
Horizontally subtract adjacent pairs of single-precision (32-bit) floating-point elements in a and b, and pack the results in dst.
Operation
dst[31:0] := a[31:0] - a[63:32]
dst[63:32] := a[95:64] - a[127:96]
dst[95:64] := b[31:0] - b[63:32]
dst[127:96] := b[95:64] - b[127:96]
dst[159:128] := a[159:128] - a[191:160]
dst[191:160] := a[223:192] - a[255:224]
dst[223:192] := b[159:128] - b[191:160]
dst[255:224] := b[223:192] - b[255:224]
dst[MAX:256] := 0
Performance
phsubsw
__m128i _mm_hsubs_epi16 (__m128i a, __m128i b)
Synopsis
__m128i _mm_hsubs_epi16 (__m128i a, __m128i b)
#include "tmmintrin.h"
Instruction: phsubsw xmm, xmm
CPUID Flags: SSSE3
Description
Horizontally subtract adjacent pairs of 16-bit integers in a and b using saturation, and pack the signed 16-bit results in dst.
Operation
dst[15:0]= Saturate_To_Int16(a[15:0] - a[31:16])
dst[31:16] = Saturate_To_Int16(a[47:32] - a[63:48])
dst[47:32] = Saturate_To_Int16(a[79:64] - a[95:80])
dst[63:48] = Saturate_To_Int16(a[111:96] - a[127:112])
dst[79:64] = Saturate_To_Int16(b[15:0] - b[31:16])
dst[95:80] = Saturate_To_Int16(b[47:32] - b[63:48])
dst[111:96] = Saturate_To_Int16(b[79:64] - b[95:80])
dst[127:112] = Saturate_To_Int16(b[111:96] - b[127:112])
Performance
vphsubsw
__m256i _mm256_hsubs_epi16 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_hsubs_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vphsubsw ymm, ymm, ymm
CPUID Flags: AVX2
Description
Horizontally subtract adjacent pairs of 16-bit integers in a and b using saturation, and pack the signed 16-bit results in dst.
Operation
dst[15:0]= Saturate_To_Int16(a[15:0] - a[31:16])
dst[31:16] = Saturate_To_Int16(a[47:32] - a[63:48])
dst[47:32] = Saturate_To_Int16(a[79:64] - a[95:80])
dst[63:48] = Saturate_To_Int16(a[111:96] - a[127:112])
dst[79:64] = Saturate_To_Int16(b[15:0] - b[31:16])
dst[95:80] = Saturate_To_Int16(b[47:32] - b[63:48])
dst[111:96] = Saturate_To_Int16(b[79:64] - b[95:80])
dst[127:112] = Saturate_To_Int16(b[111:96] - b[127:112])
dst[143:128]= Saturate_To_Int16(a[143:128] - a[159:144])
dst[159:144] = Saturate_To_Int16(a[175:160] - a[191:176])
dst[175:160] = Saturate_To_Int16(a[207:192] - a[223:208])
dst[191:176] = Saturate_To_Int16(a[239:224] - a[255:240])
dst[207:192] = Saturate_To_Int16(b[143:128] - b[159:144])
dst[223:208] = Saturate_To_Int16(b[175:160] - b[191:176])
dst[239:224] = Saturate_To_Int16(b[207:192] - b[223:208])
dst[255:240] = Saturate_To_Int16(b[239:224] - b[255:240])
dst[MAX:256] := 0
Performance
phsubsw
__m64 _mm_hsubs_pi16 (__m64 a, __m64 b)
Synopsis
__m64 _mm_hsubs_pi16 (__m64 a, __m64 b)
#include "tmmintrin.h"
Instruction: phsubsw mm, mm
CPUID Flags: SSSE3
Description
Horizontally subtract adjacent pairs of 16-bit integers in a and b using saturation, and pack the signed 16-bit results in dst.
Operation
dst[15:0]= Saturate_To_Int16(a[15:0] - a[31:16])
dst[31:16] = Saturate_To_Int16(a[47:32] - a[63:48])
dst[47:32] = Saturate_To_Int16(b[15:0] - b[31:16])
dst[63:48] = Saturate_To_Int16(b[47:32] - b[63:48])
...
__m128d _mm_hypot_pd (__m128d a, __m128d b)
Synopsis
__m128d _mm_hypot_pd (__m128d a, __m128d b)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := SQRT(a[i+63:i]^2 + b[i+63:i]^2)
ENDFOR
dst[MAX:128] := 0
...
__m256d _mm256_hypot_pd (__m256d a, __m256d b)
Synopsis
__m256d _mm256_hypot_pd (__m256d a, __m256d b)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := SQRT(a[i+63:i]^2 + b[i+63:i]^2)
ENDFOR
dst[MAX:256] := 0
...
__m512d _mm512_hypot_pd (__m512d a, __m512d b)
Synopsis
__m512d _mm512_hypot_pd (__m512d a, __m512d b)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := SQRT(a[i+63:i]^2 + b[i+63:i]^2)
ENDFOR
dst[MAX:512] := 0
...
__m512d _mm512_mask_hypot_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
Synopsis
__m512d _mm512_mask_hypot_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := SQRT(a[i+63:i]^2 + b[i+63:i]^2)
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128 _mm_hypot_ps (__m128 a, __m128 b)
Synopsis
__m128 _mm_hypot_ps (__m128 a, __m128 b)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := SQRT(a[i+31:i]^2 + b[i+31:i]^2)
ENDFOR
dst[MAX:128] := 0
...
__m256 _mm256_hypot_ps (__m256 a, __m256 b)
Synopsis
__m256 _mm256_hypot_ps (__m256 a, __m256 b)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := SQRT(a[i+31:i]^2 + b[i+31:i]^2)
ENDFOR
dst[MAX:256] := 0
...
__m512 _mm512_hypot_ps (__m512 a, __m512 b)
Synopsis
__m512 _mm512_hypot_ps (__m512 a, __m512 b)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := SQRT(a[i+31:i]^2 + b[i+31:i]^2)
ENDFOR
dst[MAX:512] := 0
...
__m512 _mm512_mask_hypot_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
Synopsis
__m512 _mm512_mask_hypot_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := SQRT(a[i+31:i]^2 + b[i+31:i]^2)
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpgatherdd
__m512i _mm512_i32extgather_epi32 (__m512i index, void const * mv, _MM_UPCONV_EPI32_ENUM conv, int scale, int hint)
Synopsis
__m512i _mm512_i32extgather_epi32 (__m512i index, void const * mv, _MM_UPCONV_EPI32_ENUM conv, int scale, int hint)
#include "immintrin.h"
Instruction: vpgatherdd zmm {k}, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Up-converts 16 memory locations starting at location mv at packed 32-bit integer indices stored in index scaled by scale using conv to 32-bit integer elements and stores them in dst.
Operation
FOR j := 0 to 15
addr := MEM[mv + index[j] * scale]
i := j*32
CASE conv OF
_MM_UPCONV_EPI32_NONE:
dst[i+31:i] := addr[i+31:i]
_MM_UPCONV_EPI32_UINT8:
n := j*8
dst[i+31:i] := UInt8ToUInt32(addr[n+7:n])
_MM_UPCONV_EPI32_SINT8:
n := j*8
dst[i+31:i] := Int8ToInt32(addr[n+7:n])
_MM_UPCONV_EPI32_UINT16:
n := j*16
dst[i+31:i] := UInt16ToUInt32(addr[n+15:n])
_MM_UPCONV_EPI32_SINT16:
n := j*16
dst[i+31:i] := Int16ToInt32(addr[n+15:n])
ESAC
ENDFOR
dst[MAX:512] := 0
vpgatherdd
__m512i _mm512_mask_i32extgather_epi32 (__m512i src, __mmask16 k, __m512i index, void const * mv, _MM_UPCONV_EPI32_ENUM conv, int scale, int hint)
Synopsis
__m512i _mm512_mask_i32extgather_epi32 (__m512i src, __mmask16 k, __m512i index, void const * mv, _MM_UPCONV_EPI32_ENUM conv, int scale, int hint)
#include "immintrin.h"
Instruction: vpgatherdd zmm {k}, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Up-converts 16 memory locations starting at location mv at packed 32-bit integer indices stored in index scaled by scale using conv to 32-bit integer elements and stores them in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
addr := MEM[mv + index[j] * scale]
i := j*32
IF k[j]
CASE conv OF
_MM_UPCONV_EPI32_NONE:
dst[i+31:i] := addr[i+31:i]
_MM_UPCONV_EPI32_UINT8:
n := j*8
dst[i+31:i] := UInt8ToUInt32(addr[n+7:n])
_MM_UPCONV_EPI32_SINT8:
n := j*8
dst[i+31:i] := Int8ToInt32(addr[n+7:n])
_MM_UPCONV_EPI32_UINT16:
n := j*16
dst[i+31:i] := UInt16ToUInt32(addr[n+15:n])
_MM_UPCONV_EPI32_SINT16:
n := j*16
dst[i+31:i] := Int16ToInt32(addr[n+15:n])
ESAC
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vgatherdps
__m512 _mm512_i32extgather_ps (__m512i index, void const * mv, _MM_UPCONV_PS_ENUM conv, int scale, int hint)
Synopsis
__m512 _mm512_i32extgather_ps (__m512i index, void const * mv, _MM_UPCONV_PS_ENUM conv, int scale, int hint)
#include "immintrin.h"
Instruction: vgatherdps zmm {k}, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Up-converts 16 memory locations starting at location mv at packed 32-bit integer indices stored in index scaled by scale using conv to single-precision (32-bit) floating-point elements and stores them in dst.
Operation
FOR j := 0 to 15
addr := MEM[mv + index[j] * scale]
i := j*32
CASE conv OF
_MM_UPCONV_PS_NONE:
dst[i+31:i] := addr[i+31:i]
_MM_UPCONV_PS_FLOAT16:
n := j*16
dst[i+31:i] := Float16ToFloat32(addr[n+15:n])
_MM_UPCONV_PS_UINT8:
n := j*8
dst[i+31:i] := UInt8ToFloat32(addr[n+7:n])
_MM_UPCONV_PS_SINT8:
n := j*8
dst[i+31:i] := SInt8ToFloat32(addr[n+7:n])
_MM_UPCONV_PS_UINT16:
n := j*16
dst[i+31:i] := UInt16ToFloat32(addr[n+15:n])
_MM_UPCONV_PS_SINT16:
n := j*16
dst[i+31:i] := SInt16ToFloat32(addr[n+15:n])
ESAC
ENDFOR
dst[MAX:512] := 0
vgatherdps
__m512 _mm512_mask_i32extgather_ps (__m512 src, __mmask16 k, __m512i index, void const * mv, _MM_UPCONV_PS_ENUM conv, int scale, int hint)
Synopsis
__m512 _mm512_mask_i32extgather_ps (__m512 src, __mmask16 k, __m512i index, void const * mv, _MM_UPCONV_PS_ENUM conv, int scale, int hint)
#include "immintrin.h"
Instruction: vgatherdps zmm {k}, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Up-converts 16 single-precision (32-bit) memory locations starting at location mv at packed 32-bit integer indices stored in index scaled by scale using conv to single-precision (32-bit) floating-point elements and stores them in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
addr := MEM[mv + index[j] * scale]
i := j*32
IF k[j]
CASE conv OF
_MM_UPCONV_PS_NONE:
dst[i+31:i] := addr[i+31:i]
_MM_UPCONV_PS_FLOAT16:
n := j*16
dst[i+31:i] := Float16ToFloat32(addr[n+15:n])
_MM_UPCONV_PS_UINT8:
n := j*8
dst[i+31:i] := UInt8ToFloat32(addr[n+7:n])
_MM_UPCONV_PS_SINT8:
n := j*8
dst[i+31:i] := SInt8ToFloat32(addr[n+7:n])
_MM_UPCONV_PS_UINT16:
n := j*16
dst[i+31:i] := UInt16ToFloat32(addr[n+15:n])
_MM_UPCONV_PS_SINT16:
n := j*16
dst[i+31:i] := SInt16ToFloat32(addr[n+15:n])
ESAC
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpscatterdd
void _mm512_i32extscatter_epi32 (void * mv, __m512i index, __m512i v1, _MM_DOWNCONV_EPI32_ENUM conv, int scale, int hint)
Synopsis
void _mm512_i32extscatter_epi32 (void * mv, __m512i index, __m512i v1, _MM_DOWNCONV_EPI32_ENUM conv, int scale, int hint)
#include "immintrin.h"
Instruction: vpscatterdd m512 {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Down-converts 16 packed 32-bit integer elements in v1 using conv and stores them in memory locations starting at location mv at packed 32-bit integer indices stored in index scaled by scale. hint indicates to the processor whether the data is non-temporal.
Operation
FOR j := 0 to 15
addr := MEM[mv + index[j] * scale]
i := j*32
CASE conv OF
_MM_DOWNCONV_EPI32_NONE:
addr[i+31:i] := v1[i+31:i]
_MM_DOWNCONV_EPI32_UINT8:
n := j*8
addr[n+7:n] := UInt32ToUInt8(v1[i+31:i])
_MM_DOWNCONV_EPI32_SINT8:
n := j*8
addr[n+7:n] := SInt32ToSInt8(v1[i+31:i])
_MM_DOWNCONV_EPI32_UINT16:
n := j*16
addr[n+15:n] := UInt32ToUInt16(v1[i+31:i])
_MM_DOWNCONV_EPI32_SINT16:
n := j*16
addr[n+15:n] := SInt32ToSInt16(v1[i+31:i])
ESAC
ENDFOR
vpscatterdd
void _mm512_mask_i32extscatter_epi32 (void * mv, __mmask16 k, __m512i index, __m512i v1, _MM_DOWNCONV_EPI32_ENUM conv, int scale, int hint)
Synopsis
void _mm512_mask_i32extscatter_epi32 (void * mv, __mmask16 k, __m512i index, __m512i v1, _MM_DOWNCONV_EPI32_ENUM conv, int scale, int hint)
#include "immintrin.h"
Instruction: vpscatterdd m512 {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Down-converts 16 packed 32-bit integer elements in v1 using conv and stores them in memory locations starting at location mv at packed 32-bit integer indices stored in index scaled by scale. Elements are written using writemask k (elements are only written when the corresponding mask bit is set; otherwise, elements are left unchanged in memory). hint indicates to the processor whether the data is non-temporal.
Operation
FOR j := 0 to 15
addr := MEM[mv + index[j] * scale]
i := j*32
IF k[j]
CASE conv OF
_MM_DOWNCONV_EPI32_NONE:
addr[i+31:i] := v1[i+31:i]
_MM_DOWNCONV_EPI32_UINT8:
n := j*8
addr[n+7:n] := UInt32ToUInt8(v1[i+31:i])
_MM_DOWNCONV_EPI32_SINT8:
n := j*8
addr[n+7:n] := SInt32ToSInt8(v1[i+31:i])
_MM_DOWNCONV_EPI32_UINT16:
n := j*16
addr[n+15:n] := UInt32ToUInt16(v1[i+31:i])
_MM_DOWNCONV_EPI32_SINT16:
n := j*16
addr[n+15:n] := SInt32ToSInt16(v1[i+31:i])
ESAC
FI
ENDFOR
vscatterdps
void _mm512_i32extscatter_ps (void * mv, __m512i index, __m512 v1, _MM_DOWNCONV_PS_ENUM conv, int scale, int hint)
Synopsis
void _mm512_i32extscatter_ps (void * mv, __m512i index, __m512 v1, _MM_DOWNCONV_PS_ENUM conv, int scale, int hint)
#include "immintrin.h"
Instruction: vscatterdps m512 {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Down-converts 16 packed single-precision (32-bit) floating-point elements in v1 and stores them in memory locations starting at location mv at packed 32-bit integer indices stored in index scaled by scale using conv.
Operation
FOR j := 0 to 15
addr := MEM[mv + index[j] * scale]
i := j*32
CASE conv OF
_MM_DOWNCONV_PS_NONE:
n := j*32
addr[n+31:n] := v1[i+31:i]
_MM_DOWNCONV_PS_FLOAT16:
n := j*16
addr[n+15:n] := Float32ToFloat16(v1[i+31:i])
_MM_DOWNCONV_PS_UINT8:
n := j*8
addr[n+7:n] := Float32ToUInt8(v1[i+31:i])
_MM_DOWNCONV_PS_SINT8:
n := j*8
addr[n+7:n] := Float32ToSInt8(v1[i+31:i])
_MM_DOWNCONV_PS_UINT16:
n := j*16
addr[n+15:n] := Float32ToUInt16(v1[i+31:i])
_MM_DOWNCONV_PS_SINT16:
n := j*16
addr[n+15:n] := Float32ToSInt16(v1[i+31:i])
ESAC
ENDFOR
vscatterdps
void _mm512_mask_i32extscatter_ps (void * mv, __mmask16 k, __m512i index, __m512 v1, _MM_DOWNCONV_PS_ENUM conv, int scale, int hint)
Synopsis
void _mm512_mask_i32extscatter_ps (void * mv, __mmask16 k, __m512i index, __m512 v1, _MM_DOWNCONV_PS_ENUM conv, int scale, int hint)
#include "immintrin.h"
Instruction: vscatterdps m512 {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Down-converts 16 packed single-precision (32-bit) floating-point elements in v1 according to conv and stores them in memory locations starting at location mv at packed 32-bit integer indices stored in index scaled by scale using writemask k (elements are written only when the corresponding mask bit is set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
addr := MEM[mv + index[j] * scale]
CASE conv OF
_MM_DOWNCONV_PS_NONE:
n := j*32
addr[n+31:n] := v1[i+31:i]
_MM_DOWNCONV_PS_FLOAT16:
n := j*16
addr[n+15:n] := Float32ToFloat16(v1[i+31:i])
_MM_DOWNCONV_PS_UINT8:
n := j*8
addr[n+7:n] := Float32ToUInt8(v1[i+31:i])
_MM_DOWNCONV_PS_SINT8:
n := j*8
addr[n+7:n] := Float32ToSInt8(v1[i+31:i])
_MM_DOWNCONV_PS_UINT16:
n := j*16
addr[n+15:n] := Float32ToUInt16(v1[i+31:i])
_MM_DOWNCONV_PS_SINT16:
n := j*16
addr[n+15:n] := Float32ToSInt16(v1[i+31:i])
ESAC
FI
ENDFOR
vpgatherdd
__m128i _mm_i32gather_epi32 (int const* base_addr, __m128i vindex, const int scale)
Synopsis
__m128i _mm_i32gather_epi32 (int const* base_addr, __m128i vindex, const int scale)
#include "immintrin.h"
Instruction: vpgatherdd xmm, vm32x, xmm
CPUID Flags: AVX2
Description
Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale]
ENDFOR
dst[MAX:128] := 0
Performance
vpgatherdd
__m128i _mm_mask_i32gather_epi32 (__m128i src, int const* base_addr, __m128i vindex, __m128i mask, const int scale)
Synopsis
__m128i _mm_mask_i32gather_epi32 (__m128i src, int const* base_addr, __m128i vindex, __m128i mask, const int scale)
#include "immintrin.h"
Instruction: vpgatherdd xmm, vm32x, xmm
CPUID Flags: AVX2
Description
Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using mask (elements are copied from src when the highest bit is not set in the corresponding element). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 3
i := j*32
IF mask[i+31]
dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale]
mask[i+31] := 0
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
mask[MAX:128] := 0
dst[MAX:128] := 0
Performance
vpgatherdd
__m128i _mm_mmask_i32gather_epi32 (__m128i src, __mmask8 k, __m128i vindex, void const* base_addr, const int scale)
Synopsis
__m128i _mm_mmask_i32gather_epi32 (__m128i src, __mmask8 k, __m128i vindex, void const* base_addr, const int scale)
#include "immintrin.h"
Instruction: vpgatherdd
CPUID Flags: AVX512VL + AVX512F
Description
Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale]
k[j] := 0
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
k[MAX:4] := 0
dst[MAX:128] := 0
vpgatherdd
__m256i _mm256_i32gather_epi32 (int const* base_addr, __m256i vindex, const int scale)
Synopsis
__m256i _mm256_i32gather_epi32 (int const* base_addr, __m256i vindex, const int scale)
#include "immintrin.h"
Instruction: vpgatherdd ymm, vm32x, ymm
CPUID Flags: AVX2
Description
Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale]
ENDFOR
dst[MAX:256] := 0
Performance
vpgatherdd
__m256i _mm256_mask_i32gather_epi32 (__m256i src, int const* base_addr, __m256i vindex, __m256i mask, const int scale)
Synopsis
__m256i _mm256_mask_i32gather_epi32 (__m256i src, int const* base_addr, __m256i vindex, __m256i mask, const int scale)
#include "immintrin.h"
Instruction: vpgatherdd ymm, vm32x, ymm
CPUID Flags: AVX2
Description
Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using mask (elements are copied from src when the highest bit is not set in the corresponding element). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 7
i := j*32
IF mask[i+31]
dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale]
mask[i+31] := 0
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
mask[MAX:256] := 0
dst[MAX:256] := 0
Performance
vpgatherdd
__m256i _mm256_mmask_i32gather_epi32 (__m256i src, __mmask8 k, __m256i vindex, void const* base_addr, const int scale)
Synopsis
__m256i _mm256_mmask_i32gather_epi32 (__m256i src, __mmask8 k, __m256i vindex, void const* base_addr, const int scale)
#include "immintrin.h"
Instruction: vpgatherdd
CPUID Flags: AVX512VL + AVX512F
Description
Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale]
k[j] := 0
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
k[MAX:8] := 0
dst[MAX:256] := 0
vpgatherdd
__m512i _mm512_i32gather_epi32 (__m512i vindex, void const* base_addr, int scale)
Synopsis
__m512i _mm512_i32gather_epi32 (__m512i vindex, void const* base_addr, int scale)
#include "immintrin.h"
Instruction: vpgatherdd zmm {k}, vm32z
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale]
ENDFOR
dst[MAX:512] := 0
vpgatherdd
__m512i _mm512_mask_i32gather_epi32 (__m512i src, __mmask16 k, __m512i vindex, void const* base_addr, int scale)
Synopsis
__m512i _mm512_mask_i32gather_epi32 (__m512i src, __mmask16 k, __m512i vindex, void const* base_addr, int scale)
#include "immintrin.h"
Instruction: vpgatherdd zmm {k}, vm32z
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale]
k[j] := 0
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
k[MAX:16] := 0
dst[MAX:512] := 0
vpgatherdq
__m128i _mm_i32gather_epi64 (__int64 const* base_addr, __m128i vindex, const int scale)
Synopsis
__m128i _mm_i32gather_epi64 (__int64 const* base_addr, __m128i vindex, const int scale)
#include "immintrin.h"
Instruction: vpgatherdq xmm, vm64x, xmm
CPUID Flags: AVX2
Description
Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 1
i := j*64
m := j*32
dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale]
ENDFOR
dst[MAX:128] := 0
Performance
vpgatherdq
__m128i _mm_mask_i32gather_epi64 (__m128i src, __int64 const* base_addr, __m128i vindex, __m128i mask, const int scale)
Synopsis
__m128i _mm_mask_i32gather_epi64 (__m128i src, __int64 const* base_addr, __m128i vindex, __m128i mask, const int scale)
#include "immintrin.h"
Instruction: vpgatherdq xmm, vm64x, xmm
CPUID Flags: AVX2
Description
Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using mask (elements are copied from src when the highest bit is not set in the corresponding element). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 1
i := j*64
m := j*32
IF mask[i+63]
dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale]
mask[i+63] := 0
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
mask[MAX:128] := 0
dst[MAX:128] := 0
Performance
vpgatherdq
__m128i _mm_mmask_i32gather_epi64 (__m128i src, __mmask8 k, __m128i vindex, void const* base_addr, const int scale)
Synopsis
__m128i _mm_mmask_i32gather_epi64 (__m128i src, __mmask8 k, __m128i vindex, void const* base_addr, const int scale)
#include "immintrin.h"
Instruction: vpgatherdq
CPUID Flags: AVX512VL + AVX512F
Description
Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 1
i := j*64
m := j*32
IF k[j]
dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale]
k[j] := 0
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
k[MAX:2] := 0
dst[MAX:128] := 0
vpgatherdq
__m256i _mm256_i32gather_epi64 (__int64 const* base_addr, __m128i vindex, const int scale)
Synopsis
__m256i _mm256_i32gather_epi64 (__int64 const* base_addr, __m128i vindex, const int scale)
#include "immintrin.h"
Instruction: vpgatherdq ymm, vm64x, ymm
CPUID Flags: AVX2
Description
Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 3
i := j*64
m := j*32
dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale]
ENDFOR
dst[MAX:256] := 0
Performance
vpgatherdq
__m256i _mm256_mask_i32gather_epi64 (__m256i src, __int64 const* base_addr, __m128i vindex, __m256i mask, const int scale)
Synopsis
__m256i _mm256_mask_i32gather_epi64 (__m256i src, __int64 const* base_addr, __m128i vindex, __m256i mask, const int scale)
#include "immintrin.h"
Instruction: vpgatherdq ymm, vm64x, ymm
CPUID Flags: AVX2
Description
Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using mask (elements are copied from src when the highest bit is not set in the corresponding element). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 3
i := j*64
m := j*32
IF mask[i+63]
dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale]
mask[i+63] := 0
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
mask[MAX:256] := 0
dst[MAX:256] := 0
Performance
vpgatherdq
__m256i _mm256_mmask_i32gather_epi64 (__m256i src, __mmask8 k, __m128i vindex, void const* base_addr, const int scale)
Synopsis
__m256i _mm256_mmask_i32gather_epi64 (__m256i src, __mmask8 k, __m128i vindex, void const* base_addr, const int scale)
#include "immintrin.h"
Instruction: vpgatherdq
CPUID Flags: AVX512VL + AVX512F
Description
Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 3
i := j*64
m := j*32
IF k[j]
dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale]
k[j] := 0
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
k[MAX:4] := 0
dst[MAX:256] := 0
vpgatherdq
__m512i _mm512_i32gather_epi64 (__m256i vindex, void const* base_addr, int scale)
Synopsis
__m512i _mm512_i32gather_epi64 (__m256i vindex, void const* base_addr, int scale)
#include "immintrin.h"
Instruction: vpgatherdq zmm {k}, vm32y
CPUID Flags: AVX512F
Description
Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 7
i := j*64
m := j*32
dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale]
ENDFOR
dst[MAX:512] := 0
vpgatherdq
__m512i _mm512_mask_i32gather_epi64 (__m512i src, __mmask8 k, __m256i vindex, void const* base_addr, int scale)
Synopsis
__m512i _mm512_mask_i32gather_epi64 (__m512i src, __mmask8 k, __m256i vindex, void const* base_addr, int scale)
#include "immintrin.h"
Instruction: vpgatherdq zmm {k}, vm32y
CPUID Flags: AVX512F
Description
Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 7
i := j*64
m := j*32
IF k[j]
dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale]
k[j] := 0
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
k[MAX:8] := 0
dst[MAX:512] := 0
vgatherdpd
__m128d _mm_i32gather_pd (double const* base_addr, __m128i vindex, const int scale)
Synopsis
__m128d _mm_i32gather_pd (double const* base_addr, __m128i vindex, const int scale)
#include "immintrin.h"
Instruction: vgatherdpd xmm, vm64x, xmm
CPUID Flags: AVX2
Description
Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 1
i := j*64
m := j*32
dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale]
ENDFOR
dst[MAX:128] := 0
Performance
vgatherdpd
__m128d _mm_mask_i32gather_pd (__m128d src, double const* base_addr, __m128i vindex, __m128d mask, const int scale)
Synopsis
__m128d _mm_mask_i32gather_pd (__m128d src, double const* base_addr, __m128i vindex, __m128d mask, const int scale)
#include "immintrin.h"
Instruction: vgatherdpd xmm, vm64x, xmm
CPUID Flags: AVX2
Description
Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using mask (elements are copied from src when the highest bit is not set in the corresponding element). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 1
i := j*64
m := j*32
IF mask[i+63]
dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale]
mask[i+63] := 0
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
mask[MAX:128] := 0
dst[MAX:128] := 0
Performance
vgatherdpd
__m128d _mm_mmask_i32gather_pd (__m128d src, __mmask8 k, __m128i vindex, void const* base_addr, const int scale)
Synopsis
__m128d _mm_mmask_i32gather_pd (__m128d src, __mmask8 k, __m128i vindex, void const* base_addr, const int scale)
#include "immintrin.h"
Instruction: vgatherdpd
CPUID Flags: AVX512VL + AVX512F
Description
Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 1
i := j*64
m := j*32
IF k[j]
dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale]
k[j] := 0
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
k[MAX:2] := 0
dst[MAX:128] := 0
vgatherdpd
__m256d _mm256_i32gather_pd (double const* base_addr, __m128i vindex, const int scale)
Synopsis
__m256d _mm256_i32gather_pd (double const* base_addr, __m128i vindex, const int scale)
#include "immintrin.h"
Instruction: vgatherdpd ymm, vm64x, ymm
CPUID Flags: AVX2
Description
Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 3
i := j*64
m := j*32
dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale]
ENDFOR
dst[MAX:256] := 0
Performance
vgatherdpd
__m256d _mm256_mask_i32gather_pd (__m256d src, double const* base_addr, __m128i vindex, __m256d mask, const int scale)
Synopsis
__m256d _mm256_mask_i32gather_pd (__m256d src, double const* base_addr, __m128i vindex, __m256d mask, const int scale)
#include "immintrin.h"
Instruction: vgatherdpd ymm, vm64x, ymm
CPUID Flags: AVX2
Description
Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using mask (elements are copied from src when the highest bit is not set in the corresponding element). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 3
i := j*64
m := j*32
IF mask[i+63]
dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale]
mask[i+63] := 0
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
mask[MAX:256] := 0
dst[MAX:256] := 0
Performance
vgatherdpd
__m256d _mm256_mmask_i32gather_pd (__m256d src, __mmask8 k, __m128i vindex, void const* base_addr, const int scale)
Synopsis
__m256d _mm256_mmask_i32gather_pd (__m256d src, __mmask8 k, __m128i vindex, void const* base_addr, const int scale)
#include "immintrin.h"
Instruction: vgatherdpd
CPUID Flags: AVX512VL + AVX512F
Description
Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 3
i := j*64
m := j*32
IF k[j]
dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale]
k[j] := 0
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
k[MAX:4] := 0
dst[MAX:256] := 0
vgatherdpd
__m512d _mm512_i32gather_pd (__m256i vindex, void const* base_addr, int scale)
Synopsis
__m512d _mm512_i32gather_pd (__m256i vindex, void const* base_addr, int scale)
#include "immintrin.h"
Instruction: vgatherdpd zmm {k}, vm32y
CPUID Flags: AVX512F
Description
Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 7
i := j*64
m := j*32
dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale]
ENDFOR
dst[MAX:512] := 0
vgatherdpd
__m512d _mm512_mask_i32gather_pd (__m512d src, __mmask8 k, __m256i vindex, void const* base_addr, int scale)
Synopsis
__m512d _mm512_mask_i32gather_pd (__m512d src, __mmask8 k, __m256i vindex, void const* base_addr, int scale)
#include "immintrin.h"
Instruction: vgatherdpd zmm {k}, vm32y
CPUID Flags: AVX512F
Description
Gather double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 7
i := j*64
m := j*32
IF k[j]
dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale]
k[j] := 0
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
k[MAX:8] := 0
dst[MAX:512] := 0
vgatherdps
__m128 _mm_i32gather_ps (float const* base_addr, __m128i vindex, const int scale)
Synopsis
__m128 _mm_i32gather_ps (float const* base_addr, __m128i vindex, const int scale)
#include "immintrin.h"
Instruction: vgatherdps xmm, vm32x, xmm
CPUID Flags: AVX2
Description
Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale]
ENDFOR
dst[MAX:128] := 0
Performance
vgatherdps
__m128 _mm_mask_i32gather_ps (__m128 src, float const* base_addr, __m128i vindex, __m128 mask, const int scale)
Synopsis
__m128 _mm_mask_i32gather_ps (__m128 src, float const* base_addr, __m128i vindex, __m128 mask, const int scale)
#include "immintrin.h"
Instruction: vgatherdps xmm, vm32x, xmm
CPUID Flags: AVX2
Description
Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using mask (elements are copied from src when the highest bit is not set in the corresponding element). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 3
i := j*32
IF mask[i+31]
dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale]
mask[i+31] := 0
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
mask[MAX:128] := 0
dst[MAX:128] := 0
Performance
vgatherdps
__m128 _mm_mmask_i32gather_ps (__m128 src, __mmask8 k, __m128i vindex, void const* base_addr, const int scale)
Synopsis
__m128 _mm_mmask_i32gather_ps (__m128 src, __mmask8 k, __m128i vindex, void const* base_addr, const int scale)
#include "immintrin.h"
Instruction: vgatherdps
CPUID Flags: AVX512VL + AVX512F
Description
Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale]
k[j] := 0
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
k[MAX:4] := 0
dst[MAX:128] := 0
vgatherdps
__m256 _mm256_i32gather_ps (float const* base_addr, __m256i vindex, const int scale)
Synopsis
__m256 _mm256_i32gather_ps (float const* base_addr, __m256i vindex, const int scale)
#include "immintrin.h"
Instruction: vgatherdps ymm, vm32x, ymm
CPUID Flags: AVX2
Description
Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale]
ENDFOR
dst[MAX:256] := 0
Performance
vgatherdps
__m256 _mm256_mask_i32gather_ps (__m256 src, float const* base_addr, __m256i vindex, __m256 mask, const int scale)
Synopsis
__m256 _mm256_mask_i32gather_ps (__m256 src, float const* base_addr, __m256i vindex, __m256 mask, const int scale)
#include "immintrin.h"
Instruction: vgatherdps ymm, vm32x, ymm
CPUID Flags: AVX2
Description
Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using mask (elements are copied from src when the highest bit is not set in the corresponding element). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 7
i := j*32
IF mask[i+31]
dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale]
mask[i+31] := 0
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
mask[MAX:256] := 0
dst[MAX:256] := 0
Performance
vgatherdps
__m256 _mm256_mmask_i32gather_ps (__m256 src, __mmask8 k, __m256i vindex, void const* base_addr, const int scale)
Synopsis
__m256 _mm256_mmask_i32gather_ps (__m256 src, __mmask8 k, __m256i vindex, void const* base_addr, const int scale)
#include "immintrin.h"
Instruction: vgatherdps
CPUID Flags: AVX512VL + AVX512F
Description
Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale]
k[j] := 0
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
k[MAX:8] := 0
dst[MAX:256] := 0
vgatherdps
__m512 _mm512_i32gather_ps (__m512i vindex, void const* base_addr, int scale)
Synopsis
__m512 _mm512_i32gather_ps (__m512i vindex, void const* base_addr, int scale)
#include "immintrin.h"
Instruction: vgatherdps zmm {k}, vm32z
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale]
ENDFOR
dst[MAX:512] := 0
vgatherdps
__m512 _mm512_mask_i32gather_ps (__m512 src, __mmask16 k, __m512i vindex, void const* base_addr, int scale)
Synopsis
__m512 _mm512_mask_i32gather_ps (__m512 src, __mmask16 k, __m512i vindex, void const* base_addr, int scale)
#include "immintrin.h"
Instruction: vgatherdps zmm {k}, vm32z
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale]
k[j] := 0
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
k[MAX:16] := 0
dst[MAX:512] := 0
vpgatherdq
__m512i _mm512_i32loextgather_epi64 (__m512i index, void const * mv, _MM_UPCONV_EPI64_ENUM conv, int scale, int hint)
Synopsis
__m512i _mm512_i32loextgather_epi64 (__m512i index, void const * mv, _MM_UPCONV_EPI64_ENUM conv, int scale, int hint)
#include "immintrin.h"
Instruction: vpgatherdq zmm {k}, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Up-converts 8 64-bit integer memory locations starting at location mv at packed 32-bit integer indices stored in the lower half of index scaled by scale using conv to 64-bit integer elements and stores them in dst.
Operation
FOR j := 0 to 7
addr := MEM[mv + index[j] * scale]
i := j*64
CASE conv OF
_MM_UPCONV_EPI64_NONE: dst[i+63:i] := addr[i+63:i]
ESAC
ENDFOR
dst[MAX:512] := 0
vpgatherdq
__m512i _mm512_mask_i32loextgather_epi64 (__m512i src, __mmask8 k, __m512i index, void const * mv, _MM_UPCONV_EPI64_ENUM conv, int scale, int hint)
Synopsis
__m512i _mm512_mask_i32loextgather_epi64 (__m512i src, __mmask8 k, __m512i index, void const * mv, _MM_UPCONV_EPI64_ENUM conv, int scale, int hint)
#include "immintrin.h"
Instruction: vpgatherdq zmm {k}, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Up-converts 8 64-bit integer memory locations starting at location mv at packed 32-bit integer indices stored in the lower half of index scaled by scale using conv to 64-bit integer elements and stores them in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
addr := MEM[mv + index[j] * scale]
i := j*64
IF k[j]
CASE conv OF
_MM_UPCONV_EPI64_NONE: dst[i+63:i] := addr[i+63:i]
ESAC
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vgatherdpd
__m512d _mm512_i32loextgather_pd (__m512i index, void const * mv, _MM_UPCONV_PD_ENUM conv, int scale, int hint)
Synopsis
__m512d _mm512_i32loextgather_pd (__m512i index, void const * mv, _MM_UPCONV_PD_ENUM conv, int scale, int hint)
#include "immintrin.h"
Instruction: vgatherdpd zmm {k}, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Up-converts 8 double-precision (64-bit) floating-point elements in memory locations starting at location mv at packed 32-bit integer indices stored in the lower half of index scaled by scale using conv to 64-bit floating-point elements and stores them in dst.
Operation
FOR j := 0 to 7
addr := MEM[mv + index[j] * scale]
i := j*64
CASE conv OF
_MM_UPCONV_PD_NONE: dst[i+63:i] := addr[i+63:i]
ESAC
ENDFOR
dst[MAX:512] := 0
vgatherdpd
__m512d _mm512_mask_i32loextgather_pd (__m512d src, __mmask8 k, __m512i index, void const * mv, _MM_UPCONV_PD_ENUM conv, int scale, int hint)
Synopsis
__m512d _mm512_mask_i32loextgather_pd (__m512d src, __mmask8 k, __m512i index, void const * mv, _MM_UPCONV_PD_ENUM conv, int scale, int hint)
#include "immintrin.h"
Instruction: vgatherdpd zmm {k}, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Up-converts 8 double-precision (64-bit) floating-point elements in memory locations starting at location mv at packed 32-bit integer indices stored in the lower half of index scaled by scale using conv to 64-bit floating-point elements and stores them in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
addr := MEM[mv + index[j] * scale]
i := j*64
IF k[j]
CASE conv OF
_MM_UPCONV_PD_NONE: dst[i+63:i] := addr[i+63:i]
ESAC
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpscatterdq
void _mm512_i32loextscatter_epi64 (void * mv, __m512i index, __m512i v1, _MM_DOWNCONV_EPI64_ENUM conv, int scale, int hint)
Synopsis
void _mm512_i32loextscatter_epi64 (void * mv, __m512i index, __m512i v1, _MM_DOWNCONV_EPI64_ENUM conv, int scale, int hint)
#include "immintrin.h"
Instruction: vpscatterdq m512 {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Down-converts 8 packed 64-bit integer elements in v1 and stores them in memory locations starting at location mv at packed 32-bit integer indices stored in index scaled by scale using conv.
Operation
FOR j := 0 to 7
addr := MEM[mv + index[j] * scale]
i := j*64
CASE conv OF
_MM_DOWNCONV_EPI64_NONE: addr[i+63:i] := v1[i+63:i]
ESAC
ENDFOR
vpscatterdq
void _mm512_mask_i32loextscatter_epi64 (void * mv, __mmask8 k, __m512i index, __m512i v1, _MM_DOWNCONV_EPI64_ENUM conv, int scale, int hint)
Synopsis
void _mm512_mask_i32loextscatter_epi64 (void * mv, __mmask8 k, __m512i index, __m512i v1, _MM_DOWNCONV_EPI64_ENUM conv, int scale, int hint)
#include "immintrin.h"
Instruction: vpscatterdq m512 {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Down-converts 8 packed 64-bit integer elements in v1 and stores them in memory locations starting at location mv at packed 32-bit integer indices stored in index scaled by scale using conv. Only those elements whose corresponding mask bit is set in writemask k are written to memory.
Operation
FOR j := 0 to 7
IF k[j]
addr := MEM[mv + index[j] * scale]
i := j*64
CASE conv OF
_MM_DOWNCONV_EPI64_NONE: addr[i+63:i] := v1[i+63:i]
ESAC
FI
ENDFOR
vscatterdpd
void _mm512_i32loextscatter_pd (void * mv, __m512i index, __m512d v1, _MM_DOWNCONV_PD_ENUM conv, int scale, int hint)
Synopsis
void _mm512_i32loextscatter_pd (void * mv, __m512i index, __m512d v1, _MM_DOWNCONV_PD_ENUM conv, int scale, int hint)
#include "immintrin.h"
Instruction: vscatterdpd m512 {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Down-converts 8 packed double-precision (64-bit) floating-point elements in v1 and stores them in memory locations starting at location mv at packed 32-bit integer indices stored in index scaled by scale using conv.
Operation
FOR j := 0 to 7
addr := MEM[mv + index[j] * scale]
i := j*64
CASE conv OF
_MM_DOWNCONV_PD_NONE: addr[i+63:i] := v1[i+63:i]
ESAC
ENDFOR
vscatterdpd
void _mm512_mask_i32loextscatter_pd (void * mv, __mmask8 k, __m512i index, __m512d v1, _MM_DOWNCONV_PD_ENUM conv, int scale, int hint)
Synopsis
void _mm512_mask_i32loextscatter_pd (void * mv, __mmask8 k, __m512i index, __m512d v1, _MM_DOWNCONV_PD_ENUM conv, int scale, int hint)
#include "immintrin.h"
Instruction: vscatterdpd m512 {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Down-converts 8 packed double-precision (64-bit) floating-point elements in v1 and stores them in memory locations starting at location mv at packed 32-bit integer indices stored in index scaled by scale using conv. Only those elements whose corresponding mask bit is set in writemask k are written to memory.
Operation
FOR j := 0 to 7
IF k[j]
addr := MEM[mv + index[j] * scale]
i := j*64
CASE conv OF
_MM_DOWNCONV_PD_NONE: addr[i+63:i] := v1[i+63:i]
ESAC
FI
ENDFOR
vpgatherdq
__m512i _mm512_i32logather_epi64 (__m512i index, void const* mv, int scale)
Synopsis
__m512i _mm512_i32logather_epi64 (__m512i index, void const* mv, int scale)
#include "immintrin.h"
Instruction: vpgatherdq zmm {k}, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Loads 8 64-bit integer elements from memory starting at location mv at packed 32-bit integer indices stored in the lower half of index scaled by scale and stores them in dst.
Operation
FOR j := 0 to 7
i := j*64
addr := MEM[mv + index[j] * scale]
dst[i+63:i] := addr[i+63:i]
ENDFOR
dst[MAX:512] := 0
vpgatherdq
__m512i _mm512_mask_i32logather_epi64 (__m512i src, __mmask8 k, __m512i index, void const* mv, int scale)
Synopsis
__m512i _mm512_mask_i32logather_epi64 (__m512i src, __mmask8 k, __m512i index, void const* mv, int scale)
#include "immintrin.h"
Instruction: vpgatherdq zmm {k}, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Loads 8 64-bit integer elements from memory starting at location mv at packed 32-bit integer indices stored in the lower half of index scaled by scale and stores them in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
addr := MEM[mv + index[j] * scale]
dst[i+63:i] := addr[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vgatherdpd
__m512d _mm512_i32logather_pd (__m512i index, void const* mv, int scale)
Synopsis
__m512d _mm512_i32logather_pd (__m512i index, void const* mv, int scale)
#include "immintrin.h"
Instruction: vgatherdpd zmm {k}, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Loads 8 double-precision (64-bit) floating-point elements stored at memory locations starting at location mv at packed 32-bit integer indices stored in the lower half of index scaled by scale and stores them in dst.
Operation
FOR j := 0 to 7
i := j*64
addr := MEM[mv + index[j] * scale]
dst[i+63:i] := addr[i+63:i]
ENDFOR
dst[MAX:512] := 0
vgatherdpd
__m512d _mm512_mask_i32logather_pd (__m512d src, __mmask8 k, __m512i index, void const* mv, int scale)
Synopsis
__m512d _mm512_mask_i32logather_pd (__m512d src, __mmask8 k, __m512i index, void const* mv, int scale)
#include "immintrin.h"
Instruction: vgatherdpd zmm {k}, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Loads 8 double-precision (64-bit) floating-point elements from memory starting at location mv at packed 32-bit integer indices stored in the lower half of index scaled by scale into dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
addr := MEM[mv + index[j] * scale]
dst[i+63:i] := addr[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpscatterdq
void _mm512_i32loscatter_epi64 (void* mv, __m512i index, __m512i v1, int scale)
Synopsis
void _mm512_i32loscatter_epi64 (void* mv, __m512i index, __m512i v1, int scale)
#include "immintrin.h"
Instruction: vpscatterdq m512 {k}, zmm
CPUID Flags: KNCNI
Description
Stores 8 packed 64-bit integer elements in v1 to memory locations starting at location mv at packed 32-bit integer indices stored in index scaled by scale.
Operation
FOR j := 0 to 7
addr := MEM[mv + index[j] * scale]
i := j*64
addr[i+63:i] := v1[i+63:i]
ENDFOR
vpscatterdq
void _mm512_mask_i32loscatter_epi64 (void* mv, __mmask8 k, __m512i index, __m512i v1, int scale)
Synopsis
void _mm512_mask_i32loscatter_epi64 (void* mv, __mmask8 k, __m512i index, __m512i v1, int scale)
#include "immintrin.h"
Instruction: vpscatterdq m512 {k}, zmm
CPUID Flags: KNCNI
Description
Stores 8 packed 64-bit integer elements in v1 to memory locations starting at location mv at packed 32-bit integer indices stored in index scaled by scale using writemask k (elements whose corresponding mask bit is not set are not written to memory).
Operation
FOR j := 0 to 7
IF k[j]
addr := MEM[mv + index[j] * scale]
i := j*64
addr[i+63:i] := v1[i+63:i]
FI
ENDFOR
vscatterdpd
void _mm512_i32loscatter_pd (void* mv, __m512i index, __m512d v1, int scale)
Synopsis
void _mm512_i32loscatter_pd (void* mv, __m512i index, __m512d v1, int scale)
#include "immintrin.h"
Instruction: vscatterdpd m512 {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Stores 8 packed double-precision (64-bit) floating-point elements in v1 to memory locations starting at location mv at packed 32-bit integer indices stored in index scaled by scale.
Operation
FOR j := 0 to 7
addr := MEM[mv + index[j] * scale]
i := j*64
addr[i+63:i] := v1[i+63:i]
ENDFOR
vscatterdpd
void _mm512_mask_i32loscatter_pd (void* mv, __mmask8 k, __m512i index, __m512d v1, int scale)
Synopsis
void _mm512_mask_i32loscatter_pd (void* mv, __mmask8 k, __m512i index, __m512d v1, int scale)
#include "immintrin.h"
Instruction: vscatterdpd m512 {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Stores 8 packed double-precision (64-bit) floating-point elements in v1 to memory locations starting at location mv at packed 32-bit integer indices stored in index scaled by scale. Only those elements whose corresponding mask bit is set in writemask k are written to memory.
Operation
FOR j := 0 to 7
IF k[j]
addr := MEM[mv + index[j] * scale]
i := j*64
addr[i+63:i] := v1[i+63:i]
FI
ENDFOR
vpscatterdd
void _mm_i32scatter_epi32 (void* base_addr, __m128i vindex, __m128i a, const int scale)
Synopsis
void _mm_i32scatter_epi32 (void* base_addr, __m128i vindex, __m128i a, const int scale)
#include "immintrin.h"
Instruction: vpscatterdd
CPUID Flags: AVX512VL + AVX512F
Description
Scatter 32-bit integers from a into memory using 32-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 3
i := j*32
MEM[base_addr + SignExtend(vindex[i+31:i])*scale] := a[i+31:i]
ENDFOR
vpscatterdd
void _mm_mask_i32scatter_epi32 (void* base_addr, __mmask8 k, __m128i vindex, __m128i a, const int scale)
Synopsis
void _mm_mask_i32scatter_epi32 (void* base_addr, __mmask8 k, __m128i vindex, __m128i a, const int scale)
#include "immintrin.h"
Instruction: vpscatterdd
CPUID Flags: AVX512VL + AVX512F
Description
Scatter 32-bit integers from a into memory using 32-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
MEM[base_addr + SignExtend(vindex[i+31:i])*scale] := a[i+31:i]
k[j] := 0
FI
ENDFOR
k[MAX:4] := 0
vpscatterdd
void _mm256_i32scatter_epi32 (void* base_addr, __m256i vindex, __m256i a, const int scale)
Synopsis
void _mm256_i32scatter_epi32 (void* base_addr, __m256i vindex, __m256i a, const int scale)
#include "immintrin.h"
Instruction: vpscatterdd
CPUID Flags: AVX512VL + AVX512F
Description
Scatter 32-bit integers from a into memory using 32-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 7
i := j*32
MEM[base_addr + SignExtend(vindex[i+31:i])*scale] := a[i+31:i]
ENDFOR
vpscatterdd
void _mm256_mask_i32scatter_epi32 (void* base_addr, __mmask8 k, __m256i vindex, __m256i a, const int scale)
Synopsis
void _mm256_mask_i32scatter_epi32 (void* base_addr, __mmask8 k, __m256i vindex, __m256i a, const int scale)
#include "immintrin.h"
Instruction: vpscatterdd
CPUID Flags: AVX512VL + AVX512F
Description
Scatter 32-bit integers from a into memory using 32-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
MEM[base_addr + SignExtend(vindex[i+31:i])*scale] := a[i+31:i]
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vpscatterdd
void _mm512_i32scatter_epi32 (void* base_addr, __m512i vindex, __m512i a, int scale)
Synopsis
void _mm512_i32scatter_epi32 (void* base_addr, __m512i vindex, __m512i a, int scale)
#include "immintrin.h"
Instruction: vpscatterdd vm32z {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Scatter 32-bit integers from a into memory using 32-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 15
i := j*32
MEM[base_addr + SignExtend(vindex[i+31:i])*scale] := a[i+31:i]
ENDFOR
vpscatterdd
void _mm512_mask_i32scatter_epi32 (void* base_addr, __mmask16 k, __m512i vindex, __m512i a, int scale)
Synopsis
void _mm512_mask_i32scatter_epi32 (void* base_addr, __mmask16 k, __m512i vindex, __m512i a, int scale)
#include "immintrin.h"
Instruction: vpscatterdd vm32z {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Scatter 32-bit integers from a into memory using 32-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
MEM[base_addr + SignExtend(vindex[i+31:i])*scale] := a[i+31:i]
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vpscatterdq
void _mm_i32scatter_epi64 (void* base_addr, __m128i vindex, __m128i a, const int scale)
Synopsis
void _mm_i32scatter_epi64 (void* base_addr, __m128i vindex, __m128i a, const int scale)
#include "immintrin.h"
Instruction: vpscatterdq
CPUID Flags: AVX512VL + AVX512F
Description
Scatter 64-bit integers from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 1
i := j*64
l := j*32
MEM[base_addr + SignExtend(vindex[l+31:l])*scale] := a[i+63:i]
ENDFOR
vpscatterdq
void _mm_mask_i32scatter_epi64 (void* base_addr, __mmask8 k, __m128i vindex, __m128i a, const int scale)
Synopsis
void _mm_mask_i32scatter_epi64 (void* base_addr, __mmask8 k, __m128i vindex, __m128i a, const int scale)
#include "immintrin.h"
Instruction: vpscatterdq
CPUID Flags: AVX512VL + AVX512F
Description
Scatter 64-bit integers from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 1
i := j*64
l := j*32
IF k[j]
MEM[base_addr + SignExtend(vindex[l+31:l])*scale] := a[i+63:i]
k[j] := 0
FI
ENDFOR
k[MAX:2] := 0
vpscatterdq
void _mm256_i32scatter_epi64 (void* base_addr, __m128i vindex, __m256i a, const int scale)
Synopsis
void _mm256_i32scatter_epi64 (void* base_addr, __m128i vindex, __m256i a, const int scale)
#include "immintrin.h"
Instruction: vpscatterdq
CPUID Flags: AVX512VL + AVX512F
Description
Scatter 64-bit integers from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 3
i := j*64
l := j*32
MEM[base_addr + SignExtend(vindex[l+31:l])*scale] := a[i+63:i]
ENDFOR
vpscatterdq
void _mm256_mask_i32scatter_epi64 (void* base_addr, __mmask8 k, __m128i vindex, __m256i a, const int scale)
Synopsis
void _mm256_mask_i32scatter_epi64 (void* base_addr, __mmask8 k, __m128i vindex, __m256i a, const int scale)
#include "immintrin.h"
Instruction: vpscatterdq
CPUID Flags: AVX512VL + AVX512F
Description
Scatter 64-bit integers from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 3
i := j*64
l := j*32
IF k[j]
MEM[base_addr + SignExtend(vindex[l+31:l])*scale] := a[i+63:i]
k[j] := 0
FI
ENDFOR
k[MAX:4] := 0
vpscatterdq
void _mm512_i32scatter_epi64 (void* base_addr, __m256i vindex, __m512i a, int scale)
Synopsis
void _mm512_i32scatter_epi64 (void* base_addr, __m256i vindex, __m512i a, int scale)
#include "immintrin.h"
Instruction: vpscatterdq vz32y {k}, zmm
CPUID Flags: AVX512F
Description
Scatter 64-bit integers from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 7
i := j*64
l := j*32
MEM[base_addr + SignExtend(vindex[l+31:l])*scale] := a[i+63:i]
ENDFOR
vpscatterdq
void _mm512_mask_i32scatter_epi64 (void* base_addr, __mmask8 k, __m256i vindex, __m512i a, int scale)
Synopsis
void _mm512_mask_i32scatter_epi64 (void* base_addr, __mmask8 k, __m256i vindex, __m512i a, int scale)
#include "immintrin.h"
Instruction: vpscatterdq vz32y {k}, zmm
CPUID Flags: AVX512F
Description
Scatter 64-bit integers from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 7
i := j*64
l := j*32
IF k[j]
MEM[base_addr + SignExtend(vindex[l+31:l])*scale] := a[i+63:i]
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vscatterdpd
void _mm_i32scatter_pd (void* base_addr, __m128i vindex, __m128d a, const int scale)
Synopsis
void _mm_i32scatter_pd (void* base_addr, __m128i vindex, __m128d a, const int scale)
#include "immintrin.h"
Instruction: vscatterdpd
CPUID Flags: AVX512VL + AVX512F
Description
Scatter double-precision (64-bit) floating-point elements from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 1
i := j*64
l := j*32
MEM[base_addr + SignExtend(vindex[l+31:l])*scale] := a[i+63:i]
ENDFOR
vscatterdpd
void _mm_mask_i32scatter_pd (void* base_addr, __mmask8 k, __m128i vindex, __m128d a, const int scale)
Synopsis
void _mm_mask_i32scatter_pd (void* base_addr, __mmask8 k, __m128i vindex, __m128d a, const int scale)
#include "immintrin.h"
Instruction: vscatterdpd
CPUID Flags: AVX512VL + AVX512F
Description
Scatter double-precision (64-bit) floating-point elements from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 1
i := j*64
l := j*32
IF k[j]
MEM[base_addr + SignExtend(vindex[l+31:l])*scale] := a[i+63:i]
k[j] := 0
FI
ENDFOR
k[MAX:2] := 0
vscatterdpd
void _mm256_i32scatter_pd (void* base_addr, __m128i vindex, __m256d a, const int scale)
Synopsis
void _mm256_i32scatter_pd (void* base_addr, __m128i vindex, __m256d a, const int scale)
#include "immintrin.h"
Instruction: vscatterdpd
CPUID Flags: AVX512VL + AVX512F
Description
Scatter double-precision (64-bit) floating-point elements from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 3
i := j*64
l := j*32
MEM[base_addr + SignExtend(vindex[l+31:l])*scale] := a[i+63:i]
ENDFOR
vscatterdpd
void _mm256_mask_i32scatter_pd (void* base_addr, __mmask8 k, __m128i vindex, __m256d a, const int scale)
Synopsis
void _mm256_mask_i32scatter_pd (void* base_addr, __mmask8 k, __m128i vindex, __m256d a, const int scale)
#include "immintrin.h"
Instruction: vscatterdpd
CPUID Flags: AVX512VL + AVX512F
Description
Scatter double-precision (64-bit) floating-point elements from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 3
i := j*64
l := j*32
IF k[j]
MEM[base_addr + SignExtend(vindex[l+31:l])*scale] := a[i+63:i]
k[j] := 0
FI
ENDFOR
k[MAX:4] := 0
vscatterdpd
void _mm512_i32scatter_pd (void* base_addr, __m256i vindex, __m512d a, int scale)
Synopsis
void _mm512_i32scatter_pd (void* base_addr, __m256i vindex, __m512d a, int scale)
#include "immintrin.h"
Instruction: vscatterdpd vm32y {k}, zmm
CPUID Flags: AVX512F
Description
Scatter double-precision (64-bit) floating-point elements from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 7
i := j*64
l := j*32
MEM[base_addr + SignExtend(vindex[l+31:l])*scale] := a[i+63:i]
ENDFOR
vscatterdpd
void _mm512_mask_i32scatter_pd (void* base_addr, __mmask8 k, __m256i vindex, __m512d a, int scale)
Synopsis
void _mm512_mask_i32scatter_pd (void* base_addr, __mmask8 k, __m256i vindex, __m512d a, int scale)
#include "immintrin.h"
Instruction: vscatterdpd vm32y {k}, zmm
CPUID Flags: AVX512F
Description
Scatter double-precision (64-bit) floating-point elements from a into memory using 32-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 7
i := j*64
l := j*32
IF k[j]
MEM[base_addr + SignExtend(vindex[l+31:l])*scale] := a[i+63:i]
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vscatterdps
void _mm_i32scatter_ps (void* base_addr, __m128i vindex, __m128 a, const int scale)
Synopsis
void _mm_i32scatter_ps (void* base_addr, __m128i vindex, __m128 a, const int scale)
#include "immintrin.h"
Instruction: vscatterdps
CPUID Flags: AVX512VL + AVX512F
Description
Scatter single-precision (32-bit) floating-point elements from a into memory using 32-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 3
i := j*32
MEM[base_addr + SignExtend(vindex[i+31:i])*scale] := a[i+31:i]
ENDFOR
vscatterdps
void _mm_mask_i32scatter_ps (void* base_addr, __mmask8 k, __m128i vindex, __m128 a, const int scale)
Synopsis
void _mm_mask_i32scatter_ps (void* base_addr, __mmask8 k, __m128i vindex, __m128 a, const int scale)
#include "immintrin.h"
Instruction: vscatterdps
CPUID Flags: AVX512VL + AVX512F
Description
Scatter single-precision (32-bit) floating-point elements from a into memory using 32-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
MEM[base_addr + SignExtend(vindex[i+31:i])*scale] := a[i+31:i]
k[j] := 0
FI
ENDFOR
k[MAX:4] := 0
vscatterdps
void _mm256_i32scatter_ps (void* base_addr, __m256i vindex, __m256 a, const int scale)
Synopsis
void _mm256_i32scatter_ps (void* base_addr, __m256i vindex, __m256 a, const int scale)
#include "immintrin.h"
Instruction: vscatterdps
CPUID Flags: AVX512VL + AVX512F
Description
Scatter single-precision (32-bit) floating-point elements from a into memory using 32-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 7
i := j*32
MEM[base_addr + SignExtend(vindex[i+31:i])*scale] := a[i+31:i]
ENDFOR
vscatterdps
void _mm256_mask_i32scatter_ps (void* base_addr, __mmask8 k, __m256i vindex, __m256 a, const int scale)
Synopsis
void _mm256_mask_i32scatter_ps (void* base_addr, __mmask8 k, __m256i vindex, __m256 a, const int scale)
#include "immintrin.h"
Instruction: vscatterdps
CPUID Flags: AVX512VL + AVX512F
Description
Scatter single-precision (32-bit) floating-point elements from a into memory using 32-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
MEM[base_addr + SignExtend(vindex[i+31:i])*scale] := a[i+31:i]
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vscatterdps
void _mm512_i32scatter_ps (void* base_addr, __m512i vindex, __m512 a, int scale)
Synopsis
void _mm512_i32scatter_ps (void* base_addr, __m512i vindex, __m512 a, int scale)
#include "immintrin.h"
Instruction: vscatterdps vm32z {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Scatter single-precision (32-bit) floating-point elements from a into memory using 32-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 15
i := j*32
MEM[base_addr + SignExtend(vindex[i+31:i])*scale] := a[i+31:i]
ENDFOR
vscatterdps
void _mm512_mask_i32scatter_ps (void* base_addr, __mmask16 k, __m512i vindex, __m512 a, int scale)
Synopsis
void _mm512_mask_i32scatter_ps (void* base_addr, __mmask16 k, __m512i vindex, __m512 a, int scale)
#include "immintrin.h"
Instruction: vscatterdps vm32z {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Scatter single-precision (32-bit) floating-point elements from a into memory using 32-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
MEM[base_addr + SignExtend(vindex[i+31:i])*scale] := a[i+31:i]
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
...
__m512i _mm512_i64extgather_epi32lo (__m512i index, void const* mv, _MM_UPCONV_EPI32_ENUM conv, int scale, int hint)
Synopsis
__m512i _mm512_i64extgather_epi32lo (__m512i index, void const* mv, _MM_UPCONV_EPI32_ENUM conv, int scale, int hint)
#include "immintrin.h"
CPUID Flags: KNCNI
Description
Up-converts 8 32-bit integer memory locations starting at location mv at packed 64-bit integer indices stored in index scaled by scale using conv to 32-bit integer elements and stores them in the lower half of dst. hint indicates to the processor whether the data is non-temporal.
Operation
FOR j := 0 to 7
addr := MEM[mv + index[j] * scale]
i := j*32
CASE conv OF
_MM_UPCONV_EPI32_NONE:
dst[i+31:i] := addr[i+31:i]
_MM_UPCONV_EPI32_UINT8:
n := j*8
dst[i+31:i] := UInt8ToInt32(addr[n+7:n])
_MM_UPCONV_EPI32_SINT8:
n := j*8
dst[i+31:i] := SInt8ToInt32(addr[n+7:n])
_MM_UPCONV_EPI32_UINT16:
n := j*16
dst[i+31:i] := UInt16ToInt32(addr[n+15:n])
_MM_UPCONV_EPI32_SINT16:
n := j*16
dst[i+31:i] := SInt16ToInt32(addr[n+15:n])
ESAC
ENDFOR
dst[MAX:256] := 0
...
__m512i _mm512_mask_i64extgather_epi32lo (__m512i src, __mmask8 k, __m512i index, void const* mv, _MM_UPCONV_EPI32_ENUM conv, int scale, int hint)
Synopsis
__m512i _mm512_mask_i64extgather_epi32lo (__m512i src, __mmask8 k, __m512i index, void const* mv, _MM_UPCONV_EPI32_ENUM conv, int scale, int hint)
#include "immintrin.h"
CPUID Flags: KNCNI
Description
Up-converts 8 32-bit integer memory locations starting at location mv at packed 64-bit integer indices stored in index scaled by scale using conv to 32-bit integer elements and stores them in the lower half of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). hint indicates to the processor whether the data is non-temporal.
Operation
FOR j := 0 to 7
addr := MEM[mv + index[j] * scale]
i := j*32
IF k[j]
CASE conv OF
_MM_UPCONV_EPI32_NONE:
dst[i+31:i] := addr[i+31:i]
_MM_UPCONV_EPI32_UINT8:
n := j*8
dst[i+31:i] := UInt8ToInt32(addr[n+7:n])
_MM_UPCONV_EPI32_SINT8:
n := j*8
dst[i+31:i] := SInt8ToInt32(addr[n+7:n])
_MM_UPCONV_EPI32_UINT16:
n := j*16
dst[i+31:i] := UInt16ToInt32(addr[n+15:n])
_MM_UPCONV_EPI32_SINT16:
n := j*16
dst[i+31:i] := SInt16ToInt32(addr[n+15:n])
ESAC
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
...
__m512i _mm512_i64extgather_epi64 (__m512i index, void const* mv, _MM_UPCONV_EPI64_ENUM conv, int scale, int hint)
Synopsis
__m512i _mm512_i64extgather_epi64 (__m512i index, void const* mv, _MM_UPCONV_EPI64_ENUM conv, int scale, int hint)
#include "immintrin.h"
CPUID Flags: KNCNI
Description
Up-converts 8 64-bit integer memory locations starting at location mv at packed 64-bit integer indices stored in index scaled by scale using conv to 64-bit integer elements and stores them in dst. hint indicates to the processor whether the load is non-temporal.
Operation
FOR j := 0 to 7
i := j*64
addr := MEM[mv + index[j] * scale]
CASE conv OF
_MM_UPCONV_EPI64_NONE: dst[i+63:i] := addr[i+63:i]
ESAC
ENDFOR
dst[MAX:512] := 0
...
__m512i _mm512_mask_i64extgather_epi64 (__m512i src, __mmask8 k, __m512i index, void const* mv, _MM_UPCONV_EPI64_ENUM conv, int scale, int hint)
Synopsis
__m512i _mm512_mask_i64extgather_epi64 (__m512i src, __mmask8 k, __m512i index, void const* mv, _MM_UPCONV_EPI64_ENUM conv, int scale, int hint)
#include "immintrin.h"
CPUID Flags: KNCNI
Description
Up-converts 8 64-bit integer memory locations starting at location mv at packed 64-bit integer indices stored in index scaled by scale using conv to 64-bit integer elements and stores them in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). hint indicates to the processor whether the load is non-temporal.
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
addr := MEM[mv + index[j] * scale]
CASE conv OF
_MM_UPCONV_EPI64_NONE: dst[i+63:i] := addr[i+63:i]
ESAC
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m512d _mm512_i64extgather_pd (__m512i index, void const * mv, _MM_UPCONV_PD_ENUM conv, int scale, int hint)
Synopsis
__m512d _mm512_i64extgather_pd (__m512i index, void const * mv, _MM_UPCONV_PD_ENUM conv, int scale, int hint)
#include "immintrin.h"
CPUID Flags: KNCNI
Description
Up-converts 8 double-precision (64-bit) floating-point elements stored in memory starting at location mv at packed 64-bit integer indices stored in index scaled by scale using conv to 64-bit floating-point elements and stores them in dst. hint indicates to the processor whether the data is non-temporal.
Operation
FOR j := 0 to 7
addr := MEM[mv + index[j] * scale]
i := j*64
CASE conv OF
_MM_UPCONV_PD_NONE: dst[i+63:i] := addr[i+63:i]
ESAC
ENDFOR
dst[MAX:512] := 0
...
__m512d _mm512_mask_i64extgather_pd (__m512d src, __mmask8 k, __m512i index, void const * mv, _MM_UPCONV_PD_ENUM conv, int scale, int hint)
Synopsis
__m512d _mm512_mask_i64extgather_pd (__m512d src, __mmask8 k, __m512i index, void const * mv, _MM_UPCONV_PD_ENUM conv, int scale, int hint)
#include "immintrin.h"
CPUID Flags: KNCNI
Description
Up-converts 8 double-precision (64-bit) floating-point elements stored in memory starting at location mv at packed 64-bit integer indices stored in index scaled by scale using conv to 64-bit floating-point elements and stores them in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). hint indicates to the processor whether the data is non-temporal.
Operation
FOR j := 0 to 7
addr := MEM[mv + index[j] * scale]
i := j*64
IF k[j]
CASE conv OF
_MM_UPCONV_PD_NONE: dst[i+63:i] := addr[i+63:i]
ESAC
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m512 _mm512_i64extgather_pslo (__m512i index, void const * mv, _MM_UPCONV_PS_ENUM conv, int scale, int hint)
Synopsis
__m512 _mm512_i64extgather_pslo (__m512i index, void const * mv, _MM_UPCONV_PS_ENUM conv, int scale, int hint)
#include "immintrin.h"
CPUID Flags: KNCNI
Description
Up-converts 8 memory locations starting at location mv at packed 64-bit integer indices stored in index scaled by scale using conv to single-precision (32-bit) floating-point elements and stores them in the lower half of dst. hint indicates to the processor whether the load is non-temporal.
Operation
FOR j := 0 to 7
addr := MEM[mv + index[j] * scale]
i := j*32
CASE conv OF
_MM_UPCONV_PS_NONE:
dst[i+31:i] := addr[i+31:i]
_MM_UPCONV_PS_FLOAT16:
n := j*16
dst[i+31:i] := Float16ToFloat32(addr[n+15:n])
_MM_UPCONV_PS_UINT8:
n := j*8
dst[i+31:i] := UInt8ToFloat32(addr[n+7:n])
_MM_UPCONV_PS_SINT8:
n := j*8
dst[i+31:i] := SInt8ToFloat32(addr[n+7:n])
_MM_UPCONV_PS_UINT16:
n := j*16
dst[i+31:i] := UInt16ToFloat32(addr[n+15:n])
_MM_UPCONV_PS_SINT16:
n := j*16
dst[i+31:i] := SInt16ToFloat32(addr[n+15:n])
ESAC
ENDFOR
dst[MAX:256] := 0
...
__m512 _mm512_mask_i64extgather_pslo (__m512 src, __mmask8 k, __m512i index, void const * mv, _MM_UPCONV_PS_ENUM conv, int scale, int hint)
Synopsis
__m512 _mm512_mask_i64extgather_pslo (__m512 src, __mmask8 k, __m512i index, void const * mv, _MM_UPCONV_PS_ENUM conv, int scale, int hint)
#include "immintrin.h"
CPUID Flags: KNCNI
Description
Up-converts 8 memory locations starting at location mv at packed 64-bit integer indices stored in index scaled by scale using conv to single-precision (32-bit) floating-point elements and stores them in the lower half of dst using writemask k (elements are copied from src when the corresponding mask bit is not set). hint indicates to the processor whether the load is non-temporal.
Operation
FOR j := 0 to 7
addr := MEM[mv + index[j] * scale]
i := j*32
IF k[j]
CASE conv OF
_MM_UPCONV_PS_NONE:
dst[i+31:i] := addr[i+31:i]
_MM_UPCONV_PS_FLOAT16:
n := j*16
dst[i+31:i] := Float16ToFloat32(addr[n+15:n])
_MM_UPCONV_PS_UINT8:
n := j*8
dst[i+31:i] := UInt8ToFloat32(addr[n+7:n])
_MM_UPCONV_PS_SINT8:
n := j*8
dst[i+31:i] := SInt8ToFloat32(addr[n+7:n])
_MM_UPCONV_PS_UINT16:
n := j*16
dst[i+31:i] := UInt16ToFloat32(addr[n+15:n])
_MM_UPCONV_PS_SINT16:
n := j*16
dst[i+31:i] := SInt16ToFloat32(addr[n+15:n])
ESAC
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
...
void _mm512_i64extscatter_epi32lo (void * mv, __m512i index, __m512i v1, _MM_DOWNCONV_EPI32_ENUM conv, int scale, int hint)
Synopsis
void _mm512_i64extscatter_epi32lo (void * mv, __m512i index, __m512i v1, _MM_DOWNCONV_EPI32_ENUM conv, int scale, int hint)
#include "immintrin.h"
CPUID Flags: KNCNI
Description
Down-converts the low 8 packed 32-bit integer elements in v1 using conv and stores them in memory locations starting at location mv at packed 64-bit integer indices stored in index scaled by scale. hint indicates to the processor whether the data is non-temporal.
Operation
FOR j := 0 to 7
addr := MEM[mv + index[j] * scale]
i := j*64
CASE conv OF
_MM_DOWNCONV_EPI32_NONE:
addr[i+31:i] := v1[i+31:i]
_MM_DOWNCONV_EPI32_UINT8:
n := j*8
addr[n+7:n] := UInt32ToUInt8(v1[i+31:i])
_MM_DOWNCONV_EPI32_SINT8:
n := j*8
addr[n+7:n] := SInt32ToSInt8(v1[i+31:i])
_MM_DOWNCONV_EPI32_UINT16:
n := j*16
addr[n+15:n] := UInt32ToUInt16(v1[i+31:i])
_MM_DOWNCONV_EPI32_SINT16:
n := j*16
addr[n+15:n] := SInt32ToSInt16(v1[i+31:i])
ESAC
ENDFOR
...
void _mm512_mask_i64extscatter_epi32lo (void * mv, __mmask8 k, __m512i index, __m512i v1, _MM_DOWNCONV_EPI32_ENUM conv, int scale, int hint)
Synopsis
void _mm512_mask_i64extscatter_epi32lo (void * mv, __mmask8 k, __m512i index, __m512i v1, _MM_DOWNCONV_EPI32_ENUM conv, int scale, int hint)
#include "immintrin.h"
CPUID Flags: KNCNI
Description
Down-converts the low 8 packed 32-bit integer elements in v1 using conv and stores them in memory locations starting at location mv at packed 64-bit integer indices stored in index scaled by scale. Elements are written to memory using writemask k (elements are only written when the corresponding mask bit is set; otherwise, the memory location is left unchanged). hint indicates to the processor whether the data is non-temporal.
Operation
FOR j := 0 to 7
addr := MEM[mv + index[j] * scale]
i := j*64
IF k[j]
CASE conv OF
_MM_DOWNCONV_EPI32_NONE:
addr[i+31:i] := v1[i+31:i]
_MM_DOWNCONV_EPI32_UINT8:
n := j*8
addr[n+7:n] := UInt32ToUInt8(v1[i+31:i])
_MM_DOWNCONV_EPI32_SINT8:
n := j*8
addr[n+7:n] := SInt32ToSInt8(v1[i+31:i])
_MM_DOWNCONV_EPI32_UINT16:
n := j*16
addr[n+15:n] := UInt32ToUInt16(v1[i+31:i])
_MM_DOWNCONV_EPI32_SINT16:
n := j*16
addr[n+15:n] := SInt32ToSInt16(v1[i+31:i])
ESAC
FI
ENDFOR
...
void _mm512_i64extscatter_epi64 (void * mv, __m512i index, __m512i v1, _MM_DOWNCONV_EPI64_ENUM conv, int scale, int hint)
Synopsis
void _mm512_i64extscatter_epi64 (void * mv, __m512i index, __m512i v1, _MM_DOWNCONV_EPI64_ENUM conv, int scale, int hint)
#include "immintrin.h"
CPUID Flags: KNCNI
Description
Down-converts 8 packed 64-bit integer elements in v1 using conv and stores them in memory locations starting at location mv at packed 64-bit integer indices stored in index scaled by scale. hint indicates to the processor whether the load is non-temporal.
Operation
FOR j := 0 to 7
addr := MEM[mv + index[j] * scale]
i := j*64
CASE conv OF
_MM_DOWNCONV_EPI64_NONE: addr[i+63:i] := v1[i+63:i]
ESAC
ENDFOR
...
void _mm512_mask_i64extscatter_epi64 (void * mv, __mmask8 k, __m512i index, __m512i v1, _MM_DOWNCONV_EPI64_ENUM conv, int scale, int hint)
Synopsis
void _mm512_mask_i64extscatter_epi64 (void * mv, __mmask8 k, __m512i index, __m512i v1, _MM_DOWNCONV_EPI64_ENUM conv, int scale, int hint)
#include "immintrin.h"
CPUID Flags: KNCNI
Description
Down-converts 8 packed 64-bit integer elements in v1 using conv and stores them in memory locations starting at location mv at packed 64-bit integer indices stored in index scaled by scale. Only those elements whose corresponding mask bit is set in writemask k are written to memory.
Operation
FOR j := 0 to 7
IF k[j]
addr := MEM[mv + index[j] * scale]
i := j*64
CASE conv OF
_MM_DOWNCONV_EPI64_NONE: addr[i+63:i] := v1[i+63:i]
ESAC
FI
ENDFOR
...
void _mm512_i64extscatter_pd (void * mv, __m512i index, __m512d v1, _MM_DOWNCONV_PD_ENUM conv, int scale, int hint)
Synopsis
void _mm512_i64extscatter_pd (void * mv, __m512i index, __m512d v1, _MM_DOWNCONV_PD_ENUM conv, int scale, int hint)
#include "immintrin.h"
CPUID Flags: KNCNI
Description
Down-converts 8 packed double-precision (64-bit) floating-point elements in v1 using conv and stores them in memory locations starting at location mv at packed 64-bit integer indices stored in index scaled by scale. hint indicates to the processor whether the data is non-temporal.
Operation
FOR j := 0 to 7
addr := MEM[mv + index[j] * scale]
i := j*64
CASE conv OF
_MM_DOWNCONV_PD_NONE:
addr[i+63:i] := v1[i+63:i]
ESAC
ENDFOR
...
void _mm512_mask_i64extscatter_pd (void * mv, __mmask8 k, __m512i index, __m512d v1, _MM_DOWNCONV_PD_ENUM conv, int scale, int hint)
Synopsis
void _mm512_mask_i64extscatter_pd (void * mv, __mmask8 k, __m512i index, __m512d v1, _MM_DOWNCONV_PD_ENUM conv, int scale, int hint)
#include "immintrin.h"
CPUID Flags: KNCNI
Description
Down-converts 8 packed double-precision (64-bit) floating-point elements in v1 using conv and stores them in memory locations starting at location mv at packed 64-bit integer indices stored in index scaled by scale. Elements are written to memory using writemask k (elements are not stored to memory when the corresponding mask bit is not set; the memory location is left unchanged). hint indicates to the processor whether the data is non-temporal.
Operation
FOR j := 0 to 7
addr := MEM[mv + index[j] * scale]
i := j*64
IF k[j]
CASE conv OF
_MM_DOWNCONV_PD_NONE:
addr[i+63:i] := v1[i+63:i]
ESAC
FI
ENDFOR
...
void _mm512_i64extscatter_pslo (void * mv, __m512i index, __m512 v1, _MM_DOWNCONV_PS_ENUM conv, int scale, int hint)
Synopsis
void _mm512_i64extscatter_pslo (void * mv, __m512i index, __m512 v1, _MM_DOWNCONV_PS_ENUM conv, int scale, int hint)
#include "immintrin.h"
CPUID Flags: KNCNI
Description
Down-converts 8 packed single-precision (32-bit) floating-point elements in v1 using conv and stores them in memory locations starting at location mv at packed 64-bit integer indices stored in index scaled by scale. hint indicates to the processor whether the data is non-temporal.
Operation
FOR j := 0 to 7
addr := MEM[mv + index[j] * scale]
i := j*32
CASE conv OF
_MM_DOWNCONV_PS_NONE:
addr[i+31:i] := v1[i+31:i]
_MM_DOWNCONV_PS_FLOAT16:
n := j*16
addr[n+15:n] := Float32ToFloat16(v1[i+31:i])
_MM_DOWNCONV_PS_UINT8:
n := j*8
addr[n+7:n] := Float32ToUInt8(v1[i+31:i])
_MM_DOWNCONV_PS_SINT8:
n := j*8
addr[n+7:n] := Float32ToSInt8(v1[i+31:i])
_MM_DOWNCONV_PS_UINT16:
n := j*16
addr[n+15:n] := Float32ToUInt16(v1[i+31:i])
_MM_DOWNCONV_PS_SINT16:
n := j*16
addr[n+15:n] := Float32ToSInt16(v1[i+31:i])
ESAC
ENDFOR
...
void _mm512_mask_i64extscatter_pslo (void * mv, __mmask8 k, __m512i index, __m512 v1, _MM_DOWNCONV_PS_ENUM conv, int scale, int hint)
Synopsis
void _mm512_mask_i64extscatter_pslo (void * mv, __mmask8 k, __m512i index, __m512 v1, _MM_DOWNCONV_PS_ENUM conv, int scale, int hint)
#include "immintrin.h"
CPUID Flags: KNCNI
Description
Down-converts 8 packed single-precision (32-bit) floating-point elements in v1 using conv and stores them in memory locations starting at location mv at packed 64-bit integer indices stored in index scaled by scale. Elements are only written when the corresponding mask bit is set in k; otherwise, elements are unchanged in memory. hint indicates to the processor whether the data is non-temporal.
Operation
FOR j := 0 to 7
addr := MEM[mv + index[j] * scale]
i := j*32
IF k[j]
CASE conv OF
_MM_DOWNCONV_PS_NONE:
addr[i+31:i] := v1[i+31:i]
_MM_DOWNCONV_PS_FLOAT16:
n := j*16
addr[n+15:n] := Float32ToFloat16(v1[i+31:i])
_MM_DOWNCONV_PS_UINT8:
n := j*8
addr[n+7:n] := Float32ToUInt8(v1[i+31:i])
_MM_DOWNCONV_PS_SINT8:
n := j*8
addr[n+7:n] := Float32ToSInt8(v1[i+31:i])
_MM_DOWNCONV_PS_UINT16:
n := j*16
addr[n+15:n] := Float32ToUInt16(v1[i+31:i])
_MM_DOWNCONV_PS_SINT16:
n := j*16
addr[n+15:n] := Float32ToSInt16(v1[i+31:i])
ESAC
FI
ENDFOR
vpgatherqd
__m128i _mm_i64gather_epi32 (int const* base_addr, __m128i vindex, const int scale)
Synopsis
__m128i _mm_i64gather_epi32 (int const* base_addr, __m128i vindex, const int scale)
#include "immintrin.h"
Instruction: vpgatherqd xmm, vm32x, xmm
CPUID Flags: AVX2
Description
Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 1
i := j*32
m := j*64
dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale]
ENDFOR
dst[MAX:64] := 0
Performance
vpgatherqd
__m128i _mm_mask_i64gather_epi32 (__m128i src, int const* base_addr, __m128i vindex, __m128i mask, const int scale)
Synopsis
__m128i _mm_mask_i64gather_epi32 (__m128i src, int const* base_addr, __m128i vindex, __m128i mask, const int scale)
#include "immintrin.h"
Instruction: vpgatherqd xmm, vm32x, xmm
CPUID Flags: AVX2
Description
Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using mask (elements are copied from src when the highest bit is not set in the corresponding element). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 1
i := j*32
m := j*64
IF mask[i+31]
dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale]
mask[i+31] := 0
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
mask[MAX:64] := 0
dst[MAX:64] := 0
Performance
vpgatherqd
__m128i _mm_mmask_i64gather_epi32 (__m128i src, __mmask8 k, __m128i vindex, void const* base_addr, const int scale)
Synopsis
__m128i _mm_mmask_i64gather_epi32 (__m128i src, __mmask8 k, __m128i vindex, void const* base_addr, const int scale)
#include "immintrin.h"
Instruction: vpgatherqd
CPUID Flags: AVX512VL + AVX512F
Description
Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 1
i := j*32
m := j*64
IF k[j]
dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale]
k[j] := 0
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
k[MAX:2] := 0
dst[MAX:64] := 0
vpgatherqd
__m128i _mm256_i64gather_epi32 (int const* base_addr, __m256i vindex, const int scale)
Synopsis
__m128i _mm256_i64gather_epi32 (int const* base_addr, __m256i vindex, const int scale)
#include "immintrin.h"
Instruction: vpgatherqd ymm, vm32x, ymm
CPUID Flags: AVX2
Description
Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 3
i := j*32
m := j*64
dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale]
ENDFOR
dst[MAX:128] := 0
vpgatherqd
__m128i _mm256_mask_i64gather_epi32 (__m128i src, int const* base_addr, __m256i vindex, __m128i mask, const int scale)
Synopsis
__m128i _mm256_mask_i64gather_epi32 (__m128i src, int const* base_addr, __m256i vindex, __m128i mask, const int scale)
#include "immintrin.h"
Instruction: vpgatherqd ymm, vm32x, ymm
CPUID Flags: AVX2
Description
Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using mask (elements are copied from src when the highest bit is not set in the corresponding element). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 3
i := j*32
m := j*64
IF mask[i+31]
dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale]
mask[i+31] := 0
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
mask[MAX:128] := 0
dst[MAX:128] := 0
vpgatherqd
__m128i _mm256_mmask_i64gather_epi32 (__m128i src, __mmask8 k, __m256i vindex, void const* base_addr, const int scale)
Synopsis
__m128i _mm256_mmask_i64gather_epi32 (__m128i src, __mmask8 k, __m256i vindex, void const* base_addr, const int scale)
#include "immintrin.h"
Instruction: vpgatherqd
CPUID Flags: AVX512VL + AVX512F
Description
Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 3
i := j*32
m := j*64
IF k[j]
dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale]
k[j] := 0
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
k[MAX:4] := 0
dst[MAX:128] := 0
vpgatherqd
__m256i _mm512_i64gather_epi32 (__m512i vindex, void const* base_addr, int scale)
Synopsis
__m256i _mm512_i64gather_epi32 (__m512i vindex, void const* base_addr, int scale)
#include "immintrin.h"
Instruction: vpgatherqd ymm {k}, vm64z
CPUID Flags: AVX512F
Description
Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 7
i := j*32
m := j*64
dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale]
ENDFOR
dst[MAX:256] := 0
vpgatherqd
__m256i _mm512_mask_i64gather_epi32 (__m256i src, __mmask8 k, __m512i vindex, void const* base_addr, int scale)
Synopsis
__m256i _mm512_mask_i64gather_epi32 (__m256i src, __mmask8 k, __m512i vindex, void const* base_addr, int scale)
#include "immintrin.h"
Instruction: vpgatherqd ymm {k}, vm64z
CPUID Flags: AVX512F
Description
Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 7
i := j*32
m := j*64
IF k[j]
dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale]
k[j] := 0
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
k[MAX:8] := 0
dst[MAX:256] := 0
...
__m512i _mm512_i64gather_epi32lo (__m512i index, void const * mv, int scale)
Synopsis
__m512i _mm512_i64gather_epi32lo (__m512i index, void const * mv, int scale)
#include "immintrin.h"
CPUID Flags: KNCNI
Description
Loads 8 32-bit integer memory locations starting at location mv at packed 64-bit integer indices stored in index scaled by scale to dst.
Operation
FOR j := 0 to 7
addr := MEM[mv + index[j] * scale]
i := j*32
dst[i+31:i] := addr[i+31:i]
ENDFOR
dst[MAX:256] := 0
...
__m512i _mm512_mask_i64gather_epi32lo (__m512i src, __mmask8 k, __m512i index, void const * mv, int scale)
Synopsis
__m512i _mm512_mask_i64gather_epi32lo (__m512i src, __mmask8 k, __m512i index, void const * mv, int scale)
#include "immintrin.h"
CPUID Flags: KNCNI
Description
Loads 8 32-bit integer memory locations starting at location mv at packed 64-bit integer indices stored in index scaled by scale to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
addr := MEM[mv + index[j] * scale]
dst[i+31:i] := addr[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vpgatherqq
__m128i _mm_i64gather_epi64 (__int64 const* base_addr, __m128i vindex, const int scale)
Synopsis
__m128i _mm_i64gather_epi64 (__int64 const* base_addr, __m128i vindex, const int scale)
#include "immintrin.h"
Instruction: vpgatherqq xmm, vm64x, xmm
CPUID Flags: AVX2
Description
Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale]
ENDFOR
dst[MAX:128] := 0
Performance
vpgatherqq
__m128i _mm_mask_i64gather_epi64 (__m128i src, __int64 const* base_addr, __m128i vindex, __m128i mask, const int scale)
Synopsis
__m128i _mm_mask_i64gather_epi64 (__m128i src, __int64 const* base_addr, __m128i vindex, __m128i mask, const int scale)
#include "immintrin.h"
Instruction: vpgatherqq xmm, vm64x, xmm
CPUID Flags: AVX2
Description
Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using mask (elements are copied from src when the highest bit is not set in the corresponding element). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 1
i := j*64
IF mask[i+63]
dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale]
mask[i+63] := 0
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
mask[MAX:128] := 0
dst[MAX:128] := 0
Performance
vpgatherqq
__m128i _mm_mmask_i64gather_epi64 (__m128i src, __mmask8 k, __m128i vindex, void const* base_addr, const int scale)
Synopsis
__m128i _mm_mmask_i64gather_epi64 (__m128i src, __mmask8 k, __m128i vindex, void const* base_addr, const int scale)
#include "immintrin.h"
Instruction: vpgatherqq
CPUID Flags: AVX512VL + AVX512F
Description
Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale]
k[j] := 0
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
k[MAX:2] := 0
dst[MAX:128] := 0
vpgatherqq
__m256i _mm256_i64gather_epi64 (__int64 const* base_addr, __m256i vindex, const int scale)
Synopsis
__m256i _mm256_i64gather_epi64 (__int64 const* base_addr, __m256i vindex, const int scale)
#include "immintrin.h"
Instruction: vpgatherqq ymm, vm64x, ymm
CPUID Flags: AVX2
Description
Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale]
ENDFOR
dst[MAX:256] := 0
Performance
vpgatherqq
__m256i _mm256_mask_i64gather_epi64 (__m256i src, __int64 const* base_addr, __m256i vindex, __m256i mask, const int scale)
Synopsis
__m256i _mm256_mask_i64gather_epi64 (__m256i src, __int64 const* base_addr, __m256i vindex, __m256i mask, const int scale)
#include "immintrin.h"
Instruction: vpgatherqq ymm, vm64x, ymm
CPUID Flags: AVX2
Description
Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using mask (elements are copied from src when the highest bit is not set in the corresponding element). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 3
i := j*64
IF mask[i+63]
dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale]
mask[i+63] := 0
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
mask[MAX:256] := 0
dst[MAX:256] := 0
Performance
vpgatherqq
__m256i _mm256_mmask_i64gather_epi64 (__m256i src, __mmask8 k, __m256i vindex, void const* base_addr, const int scale)
Synopsis
__m256i _mm256_mmask_i64gather_epi64 (__m256i src, __mmask8 k, __m256i vindex, void const* base_addr, const int scale)
#include "immintrin.h"
Instruction: vpgatherqq
CPUID Flags: AVX512VL + AVX512F
Description
Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale]
k[j] := 0
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
k[MAX:4] := 0
dst[MAX:256] := 0
vpgatherqq
__m512i _mm512_i64gather_epi64 (__m512i vindex, void const* base_addr, int scale)
Synopsis
__m512i _mm512_i64gather_epi64 (__m512i vindex, void const* base_addr, int scale)
#include "immintrin.h"
Instruction: vpgatherqq zmm {k}, vm64z
CPUID Flags: AVX512F
Description
Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale]
ENDFOR
dst[MAX:512] := 0
vpgatherqq
__m512i _mm512_mask_i64gather_epi64 (__m512i src, __mmask8 k, __m512i vindex, void const* base_addr, int scale)
Synopsis
__m512i _mm512_mask_i64gather_epi64 (__m512i src, __mmask8 k, __m512i vindex, void const* base_addr, int scale)
#include "immintrin.h"
Instruction: vpgatherqq zmm {k}, vm64z
CPUID Flags: AVX512F
Description
Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale]
k[j] := 0
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
k[MAX:8] := 0
dst[MAX:512] := 0
vgatherqpd
__m128d _mm_i64gather_pd (double const* base_addr, __m128i vindex, const int scale)
Synopsis
__m128d _mm_i64gather_pd (double const* base_addr, __m128i vindex, const int scale)
#include "immintrin.h"
Instruction: vgatherqpd xmm, vm64x, xmm
CPUID Flags: AVX2
Description
Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale]
ENDFOR
dst[MAX:128] := 0
Performance
vgatherqpd
__m128d _mm_mask_i64gather_pd (__m128d src, double const* base_addr, __m128i vindex, __m128d mask, const int scale)
Synopsis
__m128d _mm_mask_i64gather_pd (__m128d src, double const* base_addr, __m128i vindex, __m128d mask, const int scale)
#include "immintrin.h"
Instruction: vgatherqpd xmm, vm64x, xmm
CPUID Flags: AVX2
Description
Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using mask (elements are copied from src when the highest bit is not set in the corresponding element). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 1
i := j*64
IF mask[i+63]
dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale]
mask[i+63] := 0
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
mask[MAX:128] := 0
dst[MAX:128] := 0
Performance
vgatherqpd
__m128d _mm_mmask_i64gather_pd (__m128d src, __mmask8 k, __m128i vindex, void const* base_addr, const int scale)
Synopsis
__m128d _mm_mmask_i64gather_pd (__m128d src, __mmask8 k, __m128i vindex, void const* base_addr, const int scale)
#include "immintrin.h"
Instruction: vgatherqpd
CPUID Flags: AVX512VL + AVX512F
Description
Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale]
k[j] := 0
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
k[MAX:2] := 0
dst[MAX:128] := 0
vgatherqpd
__m256d _mm256_i64gather_pd (double const* base_addr, __m256i vindex, const int scale)
Synopsis
__m256d _mm256_i64gather_pd (double const* base_addr, __m256i vindex, const int scale)
#include "immintrin.h"
Instruction: vgatherqpd ymm, vm64x, ymm
CPUID Flags: AVX2
Description
Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale]
ENDFOR
dst[MAX:256] := 0
Performance
vgatherqpd
__m256d _mm256_mask_i64gather_pd (__m256d src, double const* base_addr, __m256i vindex, __m256d mask, const int scale)
Synopsis
__m256d _mm256_mask_i64gather_pd (__m256d src, double const* base_addr, __m256i vindex, __m256d mask, const int scale)
#include "immintrin.h"
Instruction: vgatherqpd ymm, vm64x, ymm
CPUID Flags: AVX2
Description
Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using mask (elements are copied from src when the highest bit is not set in the corresponding element). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 3
i := j*64
IF mask[i+63]
dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale]
mask[i+63] := 0
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
mask[MAX:256] := 0
dst[MAX:256] := 0
Performance
vgatherqpd
__m256d _mm256_mmask_i64gather_pd (__m256d src, __mmask8 k, __m256i vindex, void const* base_addr, const int scale)
Synopsis
__m256d _mm256_mmask_i64gather_pd (__m256d src, __mmask8 k, __m256i vindex, void const* base_addr, const int scale)
#include "immintrin.h"
Instruction: vgatherqpd
CPUID Flags: AVX512VL + AVX512F
Description
Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale]
k[j] := 0
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
k[MAX:4] := 0
dst[MAX:256] := 0
vgatherqpd
__m512d _mm512_i64gather_pd (__m512i vindex, void const* base_addr, int scale)
Synopsis
__m512d _mm512_i64gather_pd (__m512i vindex, void const* base_addr, int scale)
#include "immintrin.h"
Instruction: vgatherqpd zmm {k}, vm64z
CPUID Flags: AVX512F
Description
Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale]
ENDFOR
dst[MAX:512] := 0
vgatherqpd
__m512d _mm512_mask_i64gather_pd (__m512d src, __mmask8 k, __m512i vindex, void const* base_addr, int scale)
Synopsis
__m512d _mm512_mask_i64gather_pd (__m512d src, __mmask8 k, __m512i vindex, void const* base_addr, int scale)
#include "immintrin.h"
Instruction: vgatherqpd zmm {k}, vm64z
CPUID Flags: AVX512F
Description
Gather double-precision (64-bit) floating-point elements from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale]
k[j] := 0
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
k[MAX:8] := 0
dst[MAX:512] := 0
vgatherqps
__m128 _mm_i64gather_ps (float const* base_addr, __m128i vindex, const int scale)
Synopsis
__m128 _mm_i64gather_ps (float const* base_addr, __m128i vindex, const int scale)
#include "immintrin.h"
Instruction: vgatherqps xmm, vm32x, xmm
CPUID Flags: AVX2
Description
Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 1
i := j*32
m := j*64
dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale]
ENDFOR
dst[MAX:64] := 0
Performance
vgatherqps
__m128 _mm_mask_i64gather_ps (__m128 src, float const* base_addr, __m128i vindex, __m128 mask, const int scale)
Synopsis
__m128 _mm_mask_i64gather_ps (__m128 src, float const* base_addr, __m128i vindex, __m128 mask, const int scale)
#include "immintrin.h"
Instruction: vgatherqps xmm, vm32x, xmm
CPUID Flags: AVX2
Description
Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using mask (elements are copied from src when the highest bit is not set in the corresponding element). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 1
i := j*32
m := j*64
IF mask[i+31]
dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale]
mask[i+31] := 0
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
mask[MAX:64] := 0
dst[MAX:64] := 0
Performance
vgatherqps
__m128 _mm_mmask_i64gather_ps (__m128 src, __mmask8 k, __m128i vindex, void const* base_addr, const int scale)
Synopsis
__m128 _mm_mmask_i64gather_ps (__m128 src, __mmask8 k, __m128i vindex, void const* base_addr, const int scale)
#include "immintrin.h"
Instruction: vgatherqps
CPUID Flags: AVX512VL + AVX512F
Description
Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 1
i := j*32
m := j*64
IF k[j]
dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale]
k[j] := 0
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
k[MAX:2] := 0
dst[MAX:64] := 0
vgatherqps
__m128 _mm256_i64gather_ps (float const* base_addr, __m256i vindex, const int scale)
Synopsis
__m128 _mm256_i64gather_ps (float const* base_addr, __m256i vindex, const int scale)
#include "immintrin.h"
Instruction: vgatherqps ymm, vm32x, ymm
CPUID Flags: AVX2
Description
Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 3
i := j*32
m := j*64
dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale]
ENDFOR
dst[MAX:128] := 0
Performance
vgatherqps
__m128 _mm256_mask_i64gather_ps (__m128 src, float const* base_addr, __m256i vindex, __m128 mask, const int scale)
Synopsis
__m128 _mm256_mask_i64gather_ps (__m128 src, float const* base_addr, __m256i vindex, __m128 mask, const int scale)
#include "immintrin.h"
Instruction: vgatherqps ymm, vm32x, ymm
CPUID Flags: AVX2
Description
Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using mask (elements are copied from src when the highest bit is not set in the corresponding element). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 3
i := j*32
m := j*64
IF mask[i+31]
dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale]
mask[i+31] := 0
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
mask[MAX:128] := 0
dst[MAX:128] := 0
Performance
vgatherqps
__m128 _mm256_mmask_i64gather_ps (__m128 src, __mmask8 k, __m256i vindex, void const* base_addr, const int scale)
Synopsis
__m128 _mm256_mmask_i64gather_ps (__m128 src, __mmask8 k, __m256i vindex, void const* base_addr, const int scale)
#include "immintrin.h"
Instruction: vgatherqps
CPUID Flags: AVX512VL + AVX512F
Description
Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 3
i := j*32
m := j*64
IF k[j]
dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale]
k[j] := 0
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
k[MAX:4] := 0
dst[MAX:128] := 0
vgatherqps
__m256 _mm512_i64gather_ps (__m512i vindex, void const* base_addr, int scale)
Synopsis
__m256 _mm512_i64gather_ps (__m512i vindex, void const* base_addr, int scale)
#include "immintrin.h"
Instruction: vgatherqps ymm {k}, vm64z
CPUID Flags: AVX512F
Description
Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 7
i := j*32
m := j*64
dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale]
ENDFOR
dst[MAX:256] := 0
vgatherqps
__m256 _mm512_mask_i64gather_ps (__m256 src, __mmask8 k, __m512i vindex, void const* base_addr, int scale)
Synopsis
__m256 _mm512_mask_i64gather_ps (__m256 src, __mmask8 k, __m512i vindex, void const* base_addr, int scale)
#include "immintrin.h"
Instruction: vgatherqps ymm {k}, vm64z
CPUID Flags: AVX512F
Description
Gather single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 7
i := j*32
m := j*64
IF k[j]
dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale]
k[j] := 0
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
k[MAX:8] := 0
dst[MAX:256] := 0
...
__m512 _mm512_i64gather_pslo (__m512i index, void const * mv, int scale)
Synopsis
__m512 _mm512_i64gather_pslo (__m512i index, void const * mv, int scale)
#include "immintrin.h"
CPUID Flags: KNCNI
Description
Loads 8 single-precision (32-bit) floating-point memory locations starting at location mv at packed 64-bit integer indices stored in index scaled by scale to dst.
Operation
FOR j := 0 to 7
addr := MEM[mv + index[j] * scale]
i := j*32
dst[i+31:i] := addr[i+31:i]
ENDFOR
dst[MAX:256] := 0
...
__m512 _mm512_mask_i64gather_pslo (__m512 src, __mmask8 k, __m512i index, void const * mv, int scale)
Synopsis
__m512 _mm512_mask_i64gather_pslo (__m512 src, __mmask8 k, __m512i index, void const * mv, int scale)
#include "immintrin.h"
CPUID Flags: KNCNI
Description
Loads 8 single-precision (32-bit) floating-point memory locations starting at location mv at packed 64-bit integer indices stored in index scaled by scale to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
addr := MEM[mv + index[j] * scale]
dst[i+31:i] := addr[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vpscatterqd
void _mm_i64scatter_epi32 (void* base_addr, __m128i vindex, __m128i a, const int scale)
Synopsis
void _mm_i64scatter_epi32 (void* base_addr, __m128i vindex, __m128i a, const int scale)
#include "immintrin.h"
Instruction: vpscatterqd
CPUID Flags: AVX512VL + AVX512F
Description
Scatter 32-bit integers from a into memory using 64-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 1
i := j*32
l := j*64
MEM[base_addr + SignExtend(vindex[l+63:l])*scale] := a[i+31:i]
ENDFOR
vpscatterqd
void _mm_mask_i64scatter_epi32 (void* base_addr, __mmask8 k, __m128i vindex, __m128i a, const int scale)
Synopsis
void _mm_mask_i64scatter_epi32 (void* base_addr, __mmask8 k, __m128i vindex, __m128i a, const int scale)
#include "immintrin.h"
Instruction: vpscatterqd
CPUID Flags: AVX512VL + AVX512F
Description
Scatter 32-bit integers from a into memory using 64-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 1
i := j*32
l := j*64
IF k[j]
MEM[base_addr + SignExtend(vindex[l+63:l])*scale] := a[i+31:i]
k[j] := 0
FI
ENDFOR
k[MAX:2] := 0
vpscatterqd
void _mm256_i64scatter_epi32 (void* base_addr, __m256i vindex, __m128i a, const int scale)
Synopsis
void _mm256_i64scatter_epi32 (void* base_addr, __m256i vindex, __m128i a, const int scale)
#include "immintrin.h"
Instruction: vpscatterqd
CPUID Flags: AVX512VL + AVX512F
Description
Scatter 32-bit integers from a into memory using 64-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 3
i := j*32
l := j*64
MEM[base_addr + SignExtend(vindex[l+63:l])*scale] := a[i+31:i]
ENDFOR
vpscatterqd
void _mm256_mask_i64scatter_epi32 (void* base_addr, __mmask8 k, __m256i vindex, __m128i a, const int scale)
Synopsis
void _mm256_mask_i64scatter_epi32 (void* base_addr, __mmask8 k, __m256i vindex, __m128i a, const int scale)
#include "immintrin.h"
Instruction: vpscatterqd
CPUID Flags: AVX512VL + AVX512F
Description
Scatter 32-bit integers from a into memory using 64-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 3
i := j*32
l := j*64
IF k[j]
MEM[base_addr + SignExtend(vindex[l+63:l])*scale] := a[i+31:i]
k[j] := 0
FI
ENDFOR
k[MAX:4] := 0
vpscatterqd
void _mm512_i64scatter_epi32 (void* base_addr, __m512i vindex, __m256i a, int scale)
Synopsis
void _mm512_i64scatter_epi32 (void* base_addr, __m512i vindex, __m256i a, int scale)
#include "immintrin.h"
Instruction: vpscatterqd vm64z {k}, ymm
CPUID Flags: AVX512F
Description
Scatter 32-bit integers from a into memory using 64-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 7
i := j*32
l := j*64
MEM[base_addr + SignExtend(vindex[l+63:l])*scale] := a[i+31:i]
ENDFOR
vpscatterqd
void _mm512_mask_i64scatter_epi32 (void* base_addr, __mmask8 k, __m512i vindex, __m256i a, int scale)
Synopsis
void _mm512_mask_i64scatter_epi32 (void* base_addr, __mmask8 k, __m512i vindex, __m256i a, int scale)
#include "immintrin.h"
Instruction: vpscatterqd vm64z {k}, ymm
CPUID Flags: AVX512F
Description
Scatter 32-bit integers from a into memory using 64-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 7
i := j*32
l := j*64
IF k[j]
MEM[base_addr + SignExtend(vindex[l+63:l])*scale] := a[i+31:i]
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
...
void _mm512_i64scatter_epi32lo (void * mv, __m512i index, __m512i v1, int scale)
Synopsis
void _mm512_i64scatter_epi32lo (void * mv, __m512i index, __m512i v1, int scale)
#include "immintrin.h"
CPUID Flags: KNCNI
Description
Stores 8 packed 32-bit integer elements in v1 in memory locations starting at location mv at packed 64-bit integer indices stored in index scaled by scale.
Operation
FOR j := 0 to 7
i := j*32
addr := MEM[mv + index[j] * scale]
addr[i+31:i] := v1[i+31:i]
ENDFOR
...
void _mm512_mask_i64scatter_epi32lo (void * mv, __mmask8 k, __m512i index, __m512i v1, int scale)
Synopsis
void _mm512_mask_i64scatter_epi32lo (void * mv, __mmask8 k, __m512i index, __m512i v1, int scale)
#include "immintrin.h"
CPUID Flags: KNCNI
Description
Stores 8 packed 32-bit integer elements in v1 in memory locations starting at location mv at packed 64-bit integer indices stored in index scaled by scale using writemask k (elements are only written to memory when the corresponding mask bit is set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
addr := MEM[mv + index[j] * scale]
addr[i+31:i] := v1[i+31:i]
FI
ENDFOR
vpscatterqq
void _mm_i64scatter_epi64 (void* base_addr, __m128i vindex, __m128i a, const int scale)
Synopsis
void _mm_i64scatter_epi64 (void* base_addr, __m128i vindex, __m128i a, const int scale)
#include "immintrin.h"
Instruction: vpscatterqq
CPUID Flags: AVX512VL + AVX512F
Description
Scatter 64-bit integers from a into memory using 64-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 1
i := j*64
MEM[base_addr + SignExtend(vindex[i+63:i])*scale] := a[i+63:i]
ENDFOR
vpscatterqq
void _mm_mask_i64scatter_epi64 (void* base_addr, __mmask8 k, __m128i vindex, __m128i a, const int scale)
Synopsis
void _mm_mask_i64scatter_epi64 (void* base_addr, __mmask8 k, __m128i vindex, __m128i a, const int scale)
#include "immintrin.h"
Instruction: vpscatterqq
CPUID Flags: AVX512VL + AVX512F
Description
Scatter 64-bit integers from a into memory using 64-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
MEM[base_addr + SignExtend(vindex[i+63:i])*scale] := a[i+63:i]
k[j] := 0
FI
ENDFOR
k[MAX:2] := 0
vpscatterqq
void _mm256_i64scatter_epi64 (void* base_addr, __m256i vindex, __m256i a, const int scale)
Synopsis
void _mm256_i64scatter_epi64 (void* base_addr, __m256i vindex, __m256i a, const int scale)
#include "immintrin.h"
Instruction: vpscatterqq
CPUID Flags: AVX512VL + AVX512F
Description
Scatter 64-bit integers from a into memory using 64-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 3
i := j*64
MEM[base_addr + SignExtend(vindex[i+63:i])*scale] := a[i+63:i]
ENDFOR
vpscatterqq
void _mm256_mask_i64scatter_epi64 (void* base_addr, __mmask8 k, __m256i vindex, __m256i a, const int scale)
Synopsis
void _mm256_mask_i64scatter_epi64 (void* base_addr, __mmask8 k, __m256i vindex, __m256i a, const int scale)
#include "immintrin.h"
Instruction: vpscatterqq
CPUID Flags: AVX512VL + AVX512F
Description
Scatter 64-bit integers from a into memory using 64-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
MEM[base_addr + SignExtend(vindex[i+63:i])*scale] := a[i+63:i]
k[j] := 0
FI
ENDFOR
k[MAX:4] := 0
vpscatterqq
void _mm512_i64scatter_epi64 (void* base_addr, __m512i vindex, __m512i a, int scale)
Synopsis
void _mm512_i64scatter_epi64 (void* base_addr, __m512i vindex, __m512i a, int scale)
#include "immintrin.h"
Instruction: vpscatterqq vm64z {k}, zmm
CPUID Flags: AVX512F
Description
Scatter 64-bit integers from a into memory using 64-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 7
i := j*64
MEM[base_addr + SignExtend(vindex[i+63:i])*scale] := a[i+63:i]
ENDFOR
vpscatterqq
void _mm512_mask_i64scatter_epi64 (void* base_addr, __mmask8 k, __m512i vindex, __m512i a, int scale)
Synopsis
void _mm512_mask_i64scatter_epi64 (void* base_addr, __mmask8 k, __m512i vindex, __m512i a, int scale)
#include "immintrin.h"
Instruction: vpscatterqq vm64z {k}, zmm
CPUID Flags: AVX512F
Description
Scatter 64-bit integers from a into memory using 64-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
MEM[base_addr + SignExtend(vindex[i+63:i])*scale] := a[i+63:i]
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vscatterqpd
void _mm_i64scatter_pd (void* base_addr, __m128i vindex, __m128d a, const int scale)
Synopsis
void _mm_i64scatter_pd (void* base_addr, __m128i vindex, __m128d a, const int scale)
#include "immintrin.h"
Instruction: vscatterqpd
CPUID Flags: AVX512VL + AVX512F
Description
Scatter double-precision (64-bit) floating-point elements from a into memory using 64-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 1
i := j*64
MEM[base_addr + SignExtend(vindex[i+63:i])*scale] := a[i+63:i]
ENDFOR
vscatterqpd
void _mm_mask_i64scatter_pd (void* base_addr, __mmask8 k, __m128i vindex, __m128d a, const int scale)
Synopsis
void _mm_mask_i64scatter_pd (void* base_addr, __mmask8 k, __m128i vindex, __m128d a, const int scale)
#include "immintrin.h"
Instruction: vscatterqpd
CPUID Flags: AVX512VL + AVX512F
Description
Scatter double-precision (64-bit) floating-point elements from a into memory using 64-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
MEM[base_addr + SignExtend(vindex[i+63:i])*scale] := a[i+63:i]
k[j] := 0
FI
ENDFOR
k[MAX:2] := 0
vscatterqpd
void _mm256_i64scatter_pd (void* base_addr, __m256i vindex, __m256d a, const int scale)
Synopsis
void _mm256_i64scatter_pd (void* base_addr, __m256i vindex, __m256d a, const int scale)
#include "immintrin.h"
Instruction: vscatterqpd
CPUID Flags: AVX512VL + AVX512F
Description
Scatter double-precision (64-bit) floating-point elements from a into memory using 64-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 3
i := j*64
MEM[base_addr + SignExtend(vindex[i+63:i])*scale] := a[i+63:i]
ENDFOR
vscatterqpd
void _mm256_mask_i64scatter_pd (void* base_addr, __mmask8 k, __m256i vindex, __m256d a, const int scale)
Synopsis
void _mm256_mask_i64scatter_pd (void* base_addr, __mmask8 k, __m256i vindex, __m256d a, const int scale)
#include "immintrin.h"
Instruction: vscatterqpd
CPUID Flags: AVX512VL + AVX512F
Description
Scatter double-precision (64-bit) floating-point elements from a into memory using 64-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
MEM[base_addr + SignExtend(vindex[i+63:i])*scale] := a[i+63:i]
k[j] := 0
FI
ENDFOR
k[MAX:4] := 0
vscatterqpd
void _mm512_i64scatter_pd (void* base_addr, __m512i vindex, __m512d a, int scale)
Synopsis
void _mm512_i64scatter_pd (void* base_addr, __m512i vindex, __m512d a, int scale)
#include "immintrin.h"
Instruction: vscatterqpd vm64z {k}, zmm
CPUID Flags: AVX512F
Description
Scatter double-precision (64-bit) floating-point elements from a into memory using 64-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 7
i := j*64
MEM[base_addr + SignExtend(vindex[i+63:i])*scale] := a[i+63:i]
ENDFOR
vscatterqpd
void _mm512_mask_i64scatter_pd (void* base_addr, __mmask8 k, __m512i vindex, __m512d a, int scale)
Synopsis
void _mm512_mask_i64scatter_pd (void* base_addr, __mmask8 k, __m512i vindex, __m512d a, int scale)
#include "immintrin.h"
Instruction: vscatterqpd vm64z {k}, zmm
CPUID Flags: AVX512F
Description
Scatter double-precision (64-bit) floating-point elements from a into memory using 64-bit indices. 64-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
MEM[base_addr + SignExtend(vindex[i+63:i])*scale] := a[i+63:i]
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vscatterqps
void _mm_i64scatter_ps (void* base_addr, __m128i vindex, __m128 a, const int scale)
Synopsis
void _mm_i64scatter_ps (void* base_addr, __m128i vindex, __m128 a, const int scale)
#include "immintrin.h"
Instruction: vscatterqps
CPUID Flags: AVX512VL + AVX512F
Description
Scatter single-precision (32-bit) floating-point elements from a into memory using 64-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 1
i := j*32
l := j*64
MEM[base_addr + SignExtend(vindex[l+63:l])*scale] := a[i+31:i]
ENDFOR
vscatterqps
void _mm_mask_i64scatter_ps (void* base_addr, __mmask8 k, __m128i vindex, __m128 a, const int scale)
Synopsis
void _mm_mask_i64scatter_ps (void* base_addr, __mmask8 k, __m128i vindex, __m128 a, const int scale)
#include "immintrin.h"
Instruction: vscatterqps
CPUID Flags: AVX512VL + AVX512F
Description
Scatter single-precision (32-bit) floating-point elements from a into memory using 64-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 1
i := j*32
l := j*64
IF k[j]
MEM[base_addr + SignExtend(vindex[l+63:l])*scale] := a[i+31:i]
k[j] := 0
FI
ENDFOR
k[MAX:2] := 0
vscatterqps
void _mm256_i64scatter_ps (void* base_addr, __m256i vindex, __m128 a, const int scale)
Synopsis
void _mm256_i64scatter_ps (void* base_addr, __m256i vindex, __m128 a, const int scale)
#include "immintrin.h"
Instruction: vscatterqps
CPUID Flags: AVX512VL + AVX512F
Description
Scatter single-precision (32-bit) floating-point elements from a into memory using 64-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 3
i := j*32
l := j*64
MEM[base_addr + SignExtend(vindex[l+63:l])*scale] := a[i+31:i]
ENDFOR
vscatterqps
void _mm256_mask_i64scatter_ps (void* base_addr, __mmask8 k, __m256i vindex, __m128 a, const int scale)
Synopsis
void _mm256_mask_i64scatter_ps (void* base_addr, __mmask8 k, __m256i vindex, __m128 a, const int scale)
#include "immintrin.h"
Instruction: vscatterqps
CPUID Flags: AVX512VL + AVX512F
Description
Scatter single-precision (32-bit) floating-point elements from a into memory using 64-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 3
i := j*32
l := j*64
IF k[j]
MEM[base_addr + SignExtend(vindex[l+63:l])*scale] := a[i+31:i]
k[j] := 0
FI
ENDFOR
k[MAX:4] := 0
vscatterqps
void _mm512_i64scatter_ps (void* base_addr, __m512i vindex, __m256 a, int scale)
Synopsis
void _mm512_i64scatter_ps (void* base_addr, __m512i vindex, __m256 a, int scale)
#include "immintrin.h"
Instruction: vscatterqps vm64z {k}, ymm
CPUID Flags: AVX512F
Description
Scatter single-precision (32-bit) floating-point elements from a into memory using 64-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 7
i := j*32
l := j*64
MEM[base_addr + SignExtend(vindex[l+63:l])*scale] := a[i+31:i]
ENDFOR
vscatterqps
void _mm512_mask_i64scatter_ps (void* base_addr, __mmask8 k, __m512i vindex, __m256 a, int scale)
Synopsis
void _mm512_mask_i64scatter_ps (void* base_addr, __mmask8 k, __m512i vindex, __m256 a, int scale)
#include "immintrin.h"
Instruction: vscatterqps vm64z {k}, ymm
CPUID Flags: AVX512F
Description
Scatter single-precision (32-bit) floating-point elements from a into memory using 64-bit indices. 32-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not stored when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 7
i := j*32
l := j*64
IF k[j]
MEM[base_addr + SignExtend(vindex[l+63:l])*scale] := a[i+31:i]
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
...
void _mm512_i64scatter_pslo (void * mv, __m512i index, __m512 v, int scale)
Synopsis
void _mm512_i64scatter_pslo (void * mv, __m512i index, __m512 v, int scale)
#include "immintrin.h"
CPUID Flags: KNCNI
Description
Stores 8 packed single-precision (32-bit) floating-point elements in v in memory locations starting at location mv at packed 64-bit integer indices stored in index scaled by scale.
Operation
FOR j := 0 to 7
i := j*32
addr := MEM[mv + index[j] * scale]
addr[i+31:i] := v[i+31:i]
ENDFOR
...
void _mm512_mask_i64scatter_pslo (void * mv, __mmask8 k, __m512i index, __m512 v1, int scale)
Synopsis
void _mm512_mask_i64scatter_pslo (void * mv, __mmask8 k, __m512i index, __m512 v1, int scale)
#include "immintrin.h"
CPUID Flags: KNCNI
Description
Stores 8 packed single-precision (32-bit) floating-point elements in v1 in memory locations starting at location mv at packed 64-bit integer indices stored in index scaled by scale using writemask k (elements are only written to memory when the corresponding mask bit is set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
addr := MEM[mv + index[j] * scale]
addr[i+31:i] := v1[i+31:i]
FI
ENDFOR
...
__m128i _mm_idiv_epi32 (__m128i a, __m128i b)
Synopsis
__m128i _mm_idiv_epi32 (__m128i a, __m128i b)
#include "immintrin.h"
CPUID Flags: SSE
Description
Divide packed 32-bit integers in a by packed elements in b, and store the truncated results in dst.
Operation
FOR j := 0 to 3
i := 32*j
dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256i _mm256_idiv_epi32 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_idiv_epi32 (__m256i a, __m256i b)
#include "immintrin.h"
CPUID Flags: AVX
Description
Divide packed 32-bit integers in a by packed elements in b, and store the truncated results in dst.
Operation
FOR j := 0 to 7
i := 32*j
dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:256] := 0
...
__m128i _mm_idivrem_epi32 (__m128i * mem_addr, __m128i a, __m128i b)
Synopsis
__m128i _mm_idivrem_epi32 (__m128i * mem_addr, __m128i a, __m128i b)
#include "immintrin.h"
CPUID Flags: SSE
Description
Divide packed 32-bit integers in a by packed elements in b, store the truncated results in dst, and store the remainders as packed 32-bit integers into memory at mem_addr.
Operation
FOR j := 0 to 3
i := 32*j
dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
MEM[mem_addr+i+31:mem_addr+i] := REMAINDER(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256i _mm256_idivrem_epi32 (__m256i * mem_addr, __m256i a, __m256i b)
Synopsis
__m256i _mm256_idivrem_epi32 (__m256i * mem_addr, __m256i a, __m256i b)
#include "immintrin.h"
CPUID Flags: AVX
Description
Divide packed 32-bit integers in a by packed elements in b, store the truncated results in dst, and store the remainders as packed 32-bit integers into memory at mem_addr.
Operation
FOR j := 0 to 7
i := 32*j
dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
MEM[mem_addr+i+31:mem_addr+i] := REMAINDER(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:256] := 0
pinsrw
__m128i _mm_insert_epi16 (__m128i a, int i, int imm8)
Synopsis
__m128i _mm_insert_epi16 (__m128i a, int i, int imm8)
#include "emmintrin.h"
Instruction: pinsrw xmm, r32, imm
CPUID Flags: SSE2
Description
Copy a to dst, and insert the 16-bit integer i into dst at the location specified by imm8.
Operation
dst[127:0] := a[127:0]
sel := imm8[2:0]*16
dst[sel+15:sel] := i[15:0]
Performance
...
__m256i _mm256_insert_epi16 (__m256i a, __int16 i, const int index)
Synopsis
__m256i _mm256_insert_epi16 (__m256i a, __int16 i, const int index)
#include "immintrin.h"
CPUID Flags: AVX
Description
Copy a to dst, and insert the 16-bit integer i into dst at the location specified by index.
Operation
dst[255:0] := a[255:0]
sel := index*16
dst[sel+15:sel] := i[15:0]
pinsrd
__m128i _mm_insert_epi32 (__m128i a, int i, const int imm8)
Synopsis
__m128i _mm_insert_epi32 (__m128i a, int i, const int imm8)
#include "smmintrin.h"
Instruction: pinsrd xmm, r32, imm
CPUID Flags: SSE4.1
Description
Copy a to dst, and insert the 32-bit integer i into dst at the location specified by imm8.
Operation
dst[127:0] := a[127:0]
sel := imm8[1:0]*32
dst[sel+31:sel] := i[31:0]
Performance
...
__m256i _mm256_insert_epi32 (__m256i a, __int32 i, const int index)
Synopsis
__m256i _mm256_insert_epi32 (__m256i a, __int32 i, const int index)
#include "immintrin.h"
CPUID Flags: AVX
Description
Copy a to dst, and insert the 32-bit integer i into dst at the location specified by index.
Operation
dst[255:0] := a[255:0]
sel := index*32
dst[sel+31:sel] := i[31:0]
pinsrq
__m128i _mm_insert_epi64 (__m128i a, __int64 i, const int imm8)
Synopsis
__m128i _mm_insert_epi64 (__m128i a, __int64 i, const int imm8)
#include "smmintrin.h"
Instruction: pinsrq xmm, r64, imm
CPUID Flags: SSE4.1
Description
Copy a to dst, and insert the 64-bit integer i into dst at the location specified by imm8.
Operation
dst[127:0] := a[127:0]
sel := imm8[0]*64
dst[sel+63:sel] := i[63:0]
Performance
...
__m256i _mm256_insert_epi64 (__m256i a, __int64 i, const int index)
Synopsis
__m256i _mm256_insert_epi64 (__m256i a, __int64 i, const int index)
#include "immintrin.h"
CPUID Flags: AVX
Description
Copy a to dst, and insert the 64-bit integer i into dst at the location specified by index.
Operation
dst[255:0] := a[255:0]
sel := index*64
dst[sel+63:sel] := i[63:0]
pinsrb
__m128i _mm_insert_epi8 (__m128i a, int i, const int imm8)
Synopsis
__m128i _mm_insert_epi8 (__m128i a, int i, const int imm8)
#include "smmintrin.h"
Instruction: pinsrb xmm, r32, imm
CPUID Flags: SSE4.1
Description
Copy a to dst, and insert the lower 8-bit integer from i into dst at the location specified by imm8.
Operation
dst[127:0] := a[127:0]
sel := imm8[3:0]*8
dst[sel+7:sel] := i[7:0]
Performance
...
__m256i _mm256_insert_epi8 (__m256i a, __int8 i, const int index)
Synopsis
__m256i _mm256_insert_epi8 (__m256i a, __int8 i, const int index)
#include "immintrin.h"
CPUID Flags: AVX
Description
Copy a to dst, and insert the 8-bit integer i into dst at the location specified by index.
Operation
dst[255:0] := a[255:0]
sel := index*8
dst[sel+7:sel] := i[7:0]
pinsrw
__m64 _mm_insert_pi16 (__m64 a, int i, int imm8)
Synopsis
__m64 _mm_insert_pi16 (__m64 a, int i, int imm8)
#include "xmmintrin.h"
Instruction: pinsrw mm, r32, imm
CPUID Flags: SSE
Description
Copy a to dst, and insert the 16-bit integer i into dst at the location specified by imm8.
Operation
dst[63:0] := a[63:0]
sel := imm8[1:0]*16
dst[sel+15:sel] := i[15:0]
Performance
insertps
__m128 _mm_insert_ps (__m128 a, __m128 b, const int imm8)
Synopsis
__m128 _mm_insert_ps (__m128 a, __m128 b, const int imm8)
#include "smmintrin.h"
Instruction: insertps xmm, xmm, imm
CPUID Flags: SSE4.1
Description
Copy a to tmp, then insert a single-precision (32-bit) floating-point element from b into tmp using the control in imm8. Store tmp to dst using the mask in imm8 (elements are zeroed out when the corresponding bit is set).
Operation
tmp2[127:0] := a[127:0]
CASE (imm8[7:6]) of
0: tmp1[31:0] := b[31:0]
1: tmp1[31:0] := b[63:32]
2: tmp1[31:0] := b[95:64]
3: tmp1[31:0] := b[127:96]
ESAC
CASE (imm8[5:4]) of
0: tmp2[31:0] := tmp1[31:0]
1: tmp2[63:32] := tmp1[31:0]
2: tmp2[95:64] := tmp1[31:0]
3: tmp2[127:96] := tmp1[31:0]
ESAC
FOR j := 0 to 3
i := j*32
IF imm8[j%8]
dst[i+31:i] := 0
ELSE
dst[i+31:i] := tmp2[i+31:i]
FI
ENDFOR
Performance
vinsertf128
__m256d _mm256_insertf128_pd (__m256d a, __m128d b, int imm8)
Synopsis
__m256d _mm256_insertf128_pd (__m256d a, __m128d b, int imm8)
#include "immintrin.h"
Instruction: vinsertf128 ymm, ymm, xmm, imm
CPUID Flags: AVX
Description
Copy a to dst, then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from b into dst at the location specified by imm8.
Operation
dst[255:0] := a[255:0]
CASE (imm8[1:0]) of
0: dst[127:0] := b[127:0]
1: dst[255:128] := b[127:0]
ESAC
dst[MAX:256] := 0
Performance
vinsertf128
__m256 _mm256_insertf128_ps (__m256 a, __m128 b, int imm8)
Synopsis
__m256 _mm256_insertf128_ps (__m256 a, __m128 b, int imm8)
#include "immintrin.h"
Instruction: vinsertf128 ymm, ymm, xmm, imm
CPUID Flags: AVX
Description
Copy a to dst, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into dst at the location specified by imm8.
Operation
dst[255:0] := a[255:0]
CASE (imm8[1:0]) of
0: dst[127:0] := b[127:0]
1: dst[255:128] := b[127:0]
ESAC
dst[MAX:256] := 0
Performance
vinsertf128
__m256i _mm256_insertf128_si256 (__m256i a, __m128i b, int imm8)
Synopsis
__m256i _mm256_insertf128_si256 (__m256i a, __m128i b, int imm8)
#include "immintrin.h"
Instruction: vinsertf128 ymm, ymm, xmm, imm
CPUID Flags: AVX
Description
Copy a to dst, then insert 128 bits from b into dst at the location specified by imm8.
Operation
dst[255:0] := a[255:0]
CASE (imm8[1:0]) of
0: dst[127:0] := b[127:0]
1: dst[255:128] := b[127:0]
ESAC
dst[MAX:256] := 0
Performance
vinsertf32x4
__m256 _mm256_insertf32x4 (__m256 a, __m128 b, int imm8)
Synopsis
__m256 _mm256_insertf32x4 (__m256 a, __m128 b, int imm8)
#include "immintrin.h"
Instruction: vinsertf32x4
CPUID Flags: AVX512VL + AVX512F
Description
Copy a to dst, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into dst at the location specified by imm8.
Operation
dst[255:0] := a[255:0]
CASE (imm8[1:0]) of
0: dst[127:0] := b[127:0]
1: dst[255:128] := b[127:0]
ESAC
dst[MAX:256] := 0
vinsertf32x4
__m256 _mm256_mask_insertf32x4 (__m256 src, __mmask8 k, __m256 a, __m128 b, int imm8)
Synopsis
__m256 _mm256_mask_insertf32x4 (__m256 src, __mmask8 k, __m256 a, __m128 b, int imm8)
#include "immintrin.h"
Instruction: vinsertf32x4
CPUID Flags: AVX512VL + AVX512F
Description
Copy a to tmp, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
tmp[255:0] := a[255:0]
CASE (imm8[1:0]) of
0: tmp[127:0] := b[127:0]
1: tmp[255:128] := b[127:0]
ESAC
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := tmp[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vinsertf32x4
__m256 _mm256_maskz_insertf32x4 (__mmask8 k, __m256 a, __m128 b, int imm8)
Synopsis
__m256 _mm256_maskz_insertf32x4 (__mmask8 k, __m256 a, __m128 b, int imm8)
#include "immintrin.h"
Instruction: vinsertf32x4
CPUID Flags: AVX512VL + AVX512F
Description
Copy a to tmp, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
tmp[255:0] := a[255:0]
CASE (imm8[1:0]) of
0: tmp[127:0] := b[127:0]
1: tmp[255:128] := b[127:0]
ESAC
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := tmp[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vinsertf32x4
__m512 _mm512_insertf32x4 (__m512 a, __m128 b, int imm8)
Synopsis
__m512 _mm512_insertf32x4 (__m512 a, __m128 b, int imm8)
#include "immintrin.h"
Instruction: vinsertf32x4 zmm {k}, zmm, xmm, imm
CPUID Flags: AVX512F
Description
Copy a to dst, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into dst at the location specified by imm8.
Operation
dst[511:0] := a[511:0]
CASE (imm8[1:0]) of
0: dst[127:0] := b[127:0]
1: dst[255:128] := b[127:0]
2: dst[383:256] := b[127:0]
3: dst[511:384] := b[127:0]
ESAC
dst[MAX:512] := 0
vinsertf32x4
__m512 _mm512_mask_insertf32x4 (__m512 src, __mmask16 k, __m512 a, __m128 b, int imm8)
Synopsis
__m512 _mm512_mask_insertf32x4 (__m512 src, __mmask16 k, __m512 a, __m128 b, int imm8)
#include "immintrin.h"
Instruction: vinsertf32x4 zmm {k}, zmm, xmm, imm
CPUID Flags: AVX512F
Description
Copy a to tmp, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
tmp[511:0] := a[511:0]
CASE (imm8[1:0]) of
0: tmp[127:0] := b[127:0]
1: tmp[255:128] := b[127:0]
2: tmp[383:256] := b[127:0]
3: tmp[511:384] := b[127:0]
ESAC
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := tmp[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vinsertf32x4
__m512 _mm512_maskz_insertf32x4 (__mmask16 k, __m512 a, __m128 b, int imm8)
Synopsis
__m512 _mm512_maskz_insertf32x4 (__mmask16 k, __m512 a, __m128 b, int imm8)
#include "immintrin.h"
Instruction: vinsertf32x4 zmm {k}, zmm, xmm, imm
CPUID Flags: AVX512F
Description
Copy a to tmp, then insert 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
tmp[511:0] := a[511:0]
CASE (imm8[1:0]) of
0: tmp[127:0] := b[127:0]
1: tmp[255:128] := b[127:0]
2: tmp[383:256] := b[127:0]
3: tmp[511:384] := b[127:0]
ESAC
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := tmp[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vinsertf32x8
__m512 _mm512_insertf32x8 (__m512 a, __m256 b, int imm8)
Synopsis
__m512 _mm512_insertf32x8 (__m512 a, __m256 b, int imm8)
#include "immintrin.h"
Instruction: vinsertf32x8
CPUID Flags: AVX512DQ
Description
Copy a to dst, then insert 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from b into dst at the location specified by imm8.
Operation
dst[511:0] := a[511:0]
CASE (imm8[7:0]) OF
0: dst[255:0] := b[255:0]
1: dst[511:256] := b[255:0]
ESAC
dst[MAX:512] := 0
vinsertf32x8
__m512 _mm512_mask_insertf32x8 (__m512 src, __mmask16 k, __m512 a, __m256 b, int imm8)
Synopsis
__m512 _mm512_mask_insertf32x8 (__m512 src, __mmask16 k, __m512 a, __m256 b, int imm8)
#include "immintrin.h"
Instruction: vinsertf32x8
CPUID Flags: AVX512DQ
Description
Copy a to tmp, then insert 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
tmp[511:0] := a[511:0]
CASE (imm8[7:0]) OF
0: tmp[255:0] := b[255:0]
1: tmp[511:256] := b[255:0]
ESAC
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := tmp[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vinsertf32x8
__m512 _mm512_maskz_insertf32x8 (__mmask16 k, __m512 a, __m256 b, int imm8)
Synopsis
__m512 _mm512_maskz_insertf32x8 (__mmask16 k, __m512 a, __m256 b, int imm8)
#include "immintrin.h"
Instruction: vinsertf32x8
CPUID Flags: AVX512DQ
Description
Copy a to tmp, then insert 256 bits (composed of 8 packed single-precision (32-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
tmp[511:0] := a[511:0]
CASE (imm8[7:0]) OF
0: tmp[255:0] := b[255:0]
1: tmp[511:256] := b[255:0]
ESAC
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := tmp[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vinsertf64x2
__m256d _mm256_insertf64x2 (__m256d a, __m128d b, int imm8)
Synopsis
__m256d _mm256_insertf64x2 (__m256d a, __m128d b, int imm8)
#include "immintrin.h"
Instruction: vinsertf64x2
CPUID Flags: AVX512VL + AVX512DQ
Description
Copy a to dst, then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from b into dst at the location specified by imm8.
Operation
dst[255:0] := a[255:0]
CASE imm8[7:0] of
0: dst[127:0] := b[127:0]
1: dst[255:128] := b[127:0]
ESAC
dst[MAX:256] := 0
vinsertf64x2
__m256d _mm256_mask_insertf64x2 (__m256d src, __mmask8 k, __m256d a, __m128d b, int imm8)
Synopsis
__m256d _mm256_mask_insertf64x2 (__m256d src, __mmask8 k, __m256d a, __m128d b, int imm8)
#include "immintrin.h"
Instruction: vinsertf64x2
CPUID Flags: AVX512VL + AVX512DQ
Description
Copy a to tmp, then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
tmp[255:0] := a[255:0]
CASE (imm8[1:0]) of
0: tmp[127:0] := b[127:0]
1: tmp[255:128] := b[127:0]
ESAC
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := tmp[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vinsertf64x2
__m256d _mm256_maskz_insertf64x2 (__mmask8 k, __m256d a, __m128d b, int imm8)
Synopsis
__m256d _mm256_maskz_insertf64x2 (__mmask8 k, __m256d a, __m128d b, int imm8)
#include "immintrin.h"
Instruction: vinsertf64x2
CPUID Flags: AVX512VL + AVX512DQ
Description
Copy a to tmp, then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
tmp[255:0] := a[255:0]
CASE (imm8[1:0]) of
0: tmp[127:0] := b[127:0]
1: tmp[255:128] := b[127:0]
ESAC
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := tmp[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vinsertf64x2
__m512d _mm512_insertf64x2 (__m512d a, __m128d b, int imm8)
Synopsis
__m512d _mm512_insertf64x2 (__m512d a, __m128d b, int imm8)
#include "immintrin.h"
Instruction: vinsertf64x2
CPUID Flags: AVX512DQ
Description
Copy a to dst, then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from b into dst at the location specified by imm8.
Operation
dst[511:0] := a[511:0]
CASE imm8[7:0] of
0: dst[127:0] := b[127:0]
1: dst[255:128] := b[127:0]
2: dst[383:256] := b[127:0]
3: dst[511:384] := b[127:0]
ESAC
dst[MAX:512] := 0
vinsertf64x2
__m512d _mm512_mask_insertf64x2 (__m512d src, __mmask8 k, __m512d a, __m128d b, int imm8)
Synopsis
__m512d _mm512_mask_insertf64x2 (__m512d src, __mmask8 k, __m512d a, __m128d b, int imm8)
#include "immintrin.h"
Instruction: vinsertf64x2
CPUID Flags: AVX512DQ
Description
Copy a to tmp, then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
tmp[511:0] := a[511:0]
CASE (imm8[1:0]) of
0: tmp[127:0] := b[127:0]
1: tmp[255:128] := b[127:0]
2: tmp[383:256] := b[127:0]
3: tmp[511:384] := b[127:0]
ESAC
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := tmp[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vinsertf64x2
__m512d _mm512_maskz_insertf64x2 (__mmask8 k, __m512d a, __m128d b, int imm8)
Synopsis
__m512d _mm512_maskz_insertf64x2 (__mmask8 k, __m512d a, __m128d b, int imm8)
#include "immintrin.h"
Instruction: vinsertf64x2
CPUID Flags: AVX512DQ
Description
Copy a to tmp, then insert 128 bits (composed of 2 packed double-precision (64-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
tmp[511:0] := a[511:0]
CASE (imm8[1:0]) of
0: tmp[127:0] := b[127:0]
1: tmp[255:128] := b[127:0]
2: tmp[383:256] := b[127:0]
3: tmp[511:384] := b[127:0]
ESAC
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := tmp[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vinsertf64x4
__m512d _mm512_insertf64x4 (__m512d a, __m256d b, int imm8)
Synopsis
__m512d _mm512_insertf64x4 (__m512d a, __m256d b, int imm8)
#include "immintrin.h"
Instruction: vinsertf64x4 zmm {k}, zmm, ymm, imm
CPUID Flags: AVX512F
Description
Copy a to dst, then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from b into dst at the location specified by imm8.
Operation
dst[511:0] := a[511:0]
CASE (imm8[0]) of
0: dst[255:0] := b[255:0]
1: dst[511:256] := b[255:0]
ESAC
dst[MAX:512] := 0
vinsertf64x4
__m512d _mm512_mask_insertf64x4 (__m512d src, __mmask8 k, __m512d a, __m256d b, int imm8)
Synopsis
__m512d _mm512_mask_insertf64x4 (__m512d src, __mmask8 k, __m512d a, __m256d b, int imm8)
#include "immintrin.h"
Instruction: vinsertf64x4 zmm {k}, zmm, ymm, imm
CPUID Flags: AVX512F
Description
Copy a to tmp, then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
tmp[511:0] := a[511:0]
CASE (imm8[0]) of
0: tmp[255:0] := b[255:0]
1: tmp[511:256] := b[255:0]
ESAC
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := tmp[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vinsertf64x4
__m512d _mm512_maskz_insertf64x4 (__mmask8 k, __m512d a, __m256d b, int imm8)
Synopsis
__m512d _mm512_maskz_insertf64x4 (__mmask8 k, __m512d a, __m256d b, int imm8)
#include "immintrin.h"
Instruction: vinsertf64x4 zmm {k}, zmm, ymm, imm
CPUID Flags: AVX512F
Description
Copy a to tmp, then insert 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
tmp[511:0] := a[511:0]
CASE (imm8[0]) of
0: tmp[255:0] := b[255:0]
1: tmp[511:256] := b[255:0]
ESAC
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := tmp[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vinserti128
__m256i _mm256_inserti128_si256 (__m256i a, __m128i b, const int imm8)
Synopsis
__m256i _mm256_inserti128_si256 (__m256i a, __m128i b, const int imm8)
#include "immintrin.h"
Instruction: vinserti128 ymm, ymm, xmm, imm
CPUID Flags: AVX2
Description
Copy a to dst, then insert 128 bits (composed of integer data) from b into dst at the location specified by imm8.
Operation
dst[255:0] := a[255:0]
CASE (imm8[1:0]) of
0: dst[127:0] := b[127:0]
1: dst[255:128] := b[127:0]
ESAC
dst[MAX:256] := 0
Performance
vinserti32x4
__m256i _mm256_inserti32x4 (__m256i a, __m128i b, int imm8)
Synopsis
__m256i _mm256_inserti32x4 (__m256i a, __m128i b, int imm8)
#include "immintrin.h"
Instruction: vinserti32x4
CPUID Flags: AVX512VL + AVX512F
Description
Copy a to dst, then insert 128 bits (composed of 4 packed 32-bit integers) from b into dst at the location specified by imm8.
Operation
dst[255:0] := a[255:0]
CASE (imm8[1:0]) of
0: dst[127:0] := b[127:0]
1: dst[255:128] := b[127:0]
ESAC
dst[MAX:256] := 0
vinserti32x4
__m256i _mm256_mask_inserti32x4 (__m256i src, __mmask8 k, __m256i a, __m128i b, int imm8)
Synopsis
__m256i _mm256_mask_inserti32x4 (__m256i src, __mmask8 k, __m256i a, __m128i b, int imm8)
#include "immintrin.h"
Instruction: vinserti32x4
CPUID Flags: AVX512VL + AVX512F
Description
Copy a to tmp, then insert 128 bits (composed of 4 packed 32-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
tmp[255:0] := a[255:0]
CASE (imm8[1:0]) of
0: tmp[127:0] := b[127:0]
1: tmp[255:128] := b[127:0]
ESAC
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := tmp[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vinserti32x4
__m256i _mm256_maskz_inserti32x4 (__mmask8 k, __m256i a, __m128i b, int imm8)
Synopsis
__m256i _mm256_maskz_inserti32x4 (__mmask8 k, __m256i a, __m128i b, int imm8)
#include "immintrin.h"
Instruction: vinserti32x4
CPUID Flags: AVX512VL + AVX512F
Description
Copy a to tmp, then insert 128 bits (composed of 4 packed 32-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
tmp[255:0] := a[255:0]
CASE (imm8[1:0]) of
0: tmp[127:0] := b[127:0]
1: tmp[255:128] := b[127:0]
ESAC
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := tmp[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vinserti32x4
__m512i _mm512_inserti32x4 (__m512i a, __m128i b, int imm8)
Synopsis
__m512i _mm512_inserti32x4 (__m512i a, __m128i b, int imm8)
#include "immintrin.h"
Instruction: vinserti32x4 zmm {k}, zmm, xmm, imm
CPUID Flags: AVX512F
Description
Copy a to dst, then insert 128 bits (composed of 4 packed 32-bit integers) from b into dst at the location specified by imm8.
Operation
dst[511:0] := a[511:0]
CASE (imm8[1:0]) of
0: dst[127:0] := b[127:0]
1: dst[255:128] := b[127:0]
2: dst[383:256] := b[127:0]
3: dst[511:384] := b[127:0]
ESAC
dst[MAX:512] := 0
vinserti32x4
__m512i _mm512_mask_inserti32x4 (__m512i src, __mmask16 k, __m512i a, __m128i b, int imm8)
Synopsis
__m512i _mm512_mask_inserti32x4 (__m512i src, __mmask16 k, __m512i a, __m128i b, int imm8)
#include "immintrin.h"
Instruction: vinserti32x4 zmm {k}, zmm, xmm, imm
CPUID Flags: AVX512F
Description
Copy a to tmp, then insert 128 bits (composed of 4 packed 32-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
tmp[511:0] := a[511:0]
CASE (imm8[1:0]) of
0: tmp[127:0] := b[127:0]
1: tmp[255:128] := b[127:0]
2: tmp[383:256] := b[127:0]
3: tmp[511:384] := b[127:0]
ESAC
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := tmp[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vinserti32x4
__m512i _mm512_maskz_inserti32x4 (__mmask16 k, __m512i a, __m128i b, int imm8)
Synopsis
__m512i _mm512_maskz_inserti32x4 (__mmask16 k, __m512i a, __m128i b, int imm8)
#include "immintrin.h"
Instruction: vinserti32x4 zmm {k}, zmm, xmm, imm
CPUID Flags: AVX512F
Description
Copy a to tmp, then insert 128 bits (composed of 4 packed 32-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
tmp[511:0] := a[511:0]
CASE (imm8[1:0]) of
0: tmp[127:0] := b[127:0]
1: tmp[255:128] := b[127:0]
2: tmp[383:256] := b[127:0]
3: tmp[511:384] := b[127:0]
ESAC
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := tmp[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vinserti32x8
__m512i _mm512_inserti32x8 (__m512i a, __m256i b, int imm8)
Synopsis
__m512i _mm512_inserti32x8 (__m512i a, __m256i b, int imm8)
#include "immintrin.h"
Instruction: vinserti32x8
CPUID Flags: AVX512DQ
Description
Copy a to dst, then insert 256 bits (composed of 8 packed 32-bit integers) from b into dst at the location specified by imm8.
Operation
dst[511:0] := a[511:0]
CASE imm8[7:0] of
0: dst[255:0] := b[255:0]
1: dst[511:256] := b[255:0]
ESAC
dst[MAX:512] := 0
vinserti32x8
__m512i _mm512_mask_inserti32x8 (__m512i src, __mmask16 k, __m512i a, __m256i b, int imm8)
Synopsis
__m512i _mm512_mask_inserti32x8 (__m512i src, __mmask16 k, __m512i a, __m256i b, int imm8)
#include "immintrin.h"
Instruction: vinserti32x8
CPUID Flags: AVX512DQ
Description
Copy a to tmp, then insert 256 bits (composed of 8 packed 32-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
tmp[511:0] := a[511:0]
CASE (imm8[7:0]) OF
0: tmp[255:0] := b[255:0]
1: tmp[511:256] := b[255:0]
ESAC
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := tmp[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vinserti32x8
__m512i _mm512_maskz_inserti32x8 (__mmask16 k, __m512i a, __m256i b, int imm8)
Synopsis
__m512i _mm512_maskz_inserti32x8 (__mmask16 k, __m512i a, __m256i b, int imm8)
#include "immintrin.h"
Instruction: vinserti32x8
CPUID Flags: AVX512DQ
Description
Copy a to tmp, then insert 256 bits (composed of 8 packed 32-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
tmp[511:0] := a[511:0]
CASE (imm8[7:0]) OF
0: tmp[255:0] := b[255:0]
1: tmp[511:256] := b[255:0]
ESAC
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := tmp[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vinserti64x2
__m256i _mm256_inserti64x2 (__m256i a, __m128i b, int imm8)
Synopsis
__m256i _mm256_inserti64x2 (__m256i a, __m128i b, int imm8)
#include "immintrin.h"
Instruction: vinserti64x2
CPUID Flags: AVX512VL + AVX512DQ
Description
Copy a to dst, then insert 128 bits (composed of 2 packed 64-bit integers) from b into dst at the location specified by imm8.
Operation
dst[255:0] := a[255:0]
CASE imm8[7:0] of
0: dst[127:0] := b[127:0]
1: dst[255:128] := b[127:0]
ESAC
dst[MAX:256] := 0
vinserti64x2
__m256i _mm256_mask_inserti64x2 (__m256i src, __mmask8 k, __m256i a, __m128i b, int imm8)
Synopsis
__m256i _mm256_mask_inserti64x2 (__m256i src, __mmask8 k, __m256i a, __m128i b, int imm8)
#include "immintrin.h"
Instruction: vinserti64x2
CPUID Flags: AVX512VL + AVX512DQ
Description
Copy a to tmp, then insert 128 bits (composed of 2 packed 64-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
tmp[255:0] := a[255:0]
CASE (imm8[1:0]) of
0: tmp[127:0] := b[127:0]
1: tmp[255:128] := b[127:0]
ESAC
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := tmp[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vinserti64x2
__m256i _mm256_maskz_inserti64x2 (__mmask8 k, __m256i a, __m128i b, int imm8)
Synopsis
__m256i _mm256_maskz_inserti64x2 (__mmask8 k, __m256i a, __m128i b, int imm8)
#include "immintrin.h"
Instruction: vinserti64x2
CPUID Flags: AVX512VL + AVX512DQ
Description
Copy a to tmp, then insert 128 bits (composed of 2 packed 64-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
tmp[255:0] := a[255:0]
CASE (imm8[1:0]) of
0: tmp[127:0] := b[127:0]
1: tmp[255:128] := b[127:0]
ESAC
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := tmp[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vinserti64x2
__m512i _mm512_inserti64x2 (__m512i a, __m128i b, int imm8)
Synopsis
__m512i _mm512_inserti64x2 (__m512i a, __m128i b, int imm8)
#include "immintrin.h"
Instruction: vinserti64x2
CPUID Flags: AVX512DQ
Description
Copy a to dst, then insert 128 bits (composed of 2 packed 64-bit integers) from b into dst at the location specified by imm8.
Operation
dst[511:0] := a[511:0]
CASE imm8[7:0] of
0: dst[127:0] := b[127:0]
1: dst[255:128] := b[127:0]
2: dst[383:256] := b[127:0]
3: dst[511:384] := b[127:0]
ESAC
dst[MAX:512] := 0
vinserti64x2
__m512i _mm512_mask_inserti64x2 (__m512i src, __mmask8 k, __m512i a, __m128i b, int imm8)
Synopsis
__m512i _mm512_mask_inserti64x2 (__m512i src, __mmask8 k, __m512i a, __m128i b, int imm8)
#include "immintrin.h"
Instruction: vinserti64x2
CPUID Flags: AVX512DQ
Description
Copy a to tmp, then insert 128 bits (composed of 2 packed 64-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
tmp[511:0] := a[511:0]
CASE (imm8[1:0]) of
0: tmp[127:0] := b[127:0]
1: tmp[255:128] := b[127:0]
2: tmp[383:256] := b[127:0]
3: tmp[511:384] := b[127:0]
ESAC
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := tmp[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vinserti64x2
__m512i _mm512_maskz_inserti64x2 (__mmask8 k, __m512i a, __m128i b, int imm8)
Synopsis
__m512i _mm512_maskz_inserti64x2 (__mmask8 k, __m512i a, __m128i b, int imm8)
#include "immintrin.h"
Instruction: vinserti64x2
CPUID Flags: AVX512DQ
Description
Copy a to tmp, then insert 128 bits (composed of 2 packed 64-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
tmp[511:0] := a[511:0]
CASE (imm8[1:0]) of
0: tmp[127:0] := b[127:0]
1: tmp[255:128] := b[127:0]
2: tmp[383:256] := b[127:0]
3: tmp[511:384] := b[127:0]
ESAC
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := tmp[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vinserti64x4
__m512i _mm512_inserti64x4 (__m512i a, __m256i b, int imm8)
Synopsis
__m512i _mm512_inserti64x4 (__m512i a, __m256i b, int imm8)
#include "immintrin.h"
Instruction: vinserti64x4 zmm {k}, zmm, ymm, imm
CPUID Flags: AVX512F
Description
Copy a to dst, then insert 256 bits (composed of 4 packed 64-bit integers) from b into dst at the location specified by imm8.
Operation
dst[511:0] := a[511:0]
CASE (imm8[7:0]) OF
0: dst[255:0] := b[255:0]
1: dst[511:256] := b[255:0]
ESAC
dst[MAX:512] := 0
vinserti64x4
__m512i _mm512_mask_inserti64x4 (__m512i src, __mmask8 k, __m512i a, __m256i b, int imm8)
Synopsis
__m512i _mm512_mask_inserti64x4 (__m512i src, __mmask8 k, __m512i a, __m256i b, int imm8)
#include "immintrin.h"
Instruction: vinserti64x4 zmm {k}, zmm, ymm, imm
CPUID Flags: AVX512F
Description
Copy a to tmp, then insert 256 bits (composed of 4 packed 64-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
tmp[511:0] := a[511:0]
CASE (imm8[0]) of
0: tmp[255:0] := b[255:0]
1: tmp[511:256] := b[255:0]
ESAC
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := tmp[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vinserti64x4
__m512i _mm512_maskz_inserti64x4 (__mmask8 k, __m512i a, __m256i b, int imm8)
Synopsis
__m512i _mm512_maskz_inserti64x4 (__mmask8 k, __m512i a, __m256i b, int imm8)
#include "immintrin.h"
Instruction: vinserti64x4 zmm {k}, zmm, ymm, imm
CPUID Flags: AVX512F
Description
Copy a to tmp, then insert 256 bits (composed of 4 packed 64-bit integers) from b into tmp at the location specified by imm8. Store tmp to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
tmp[511:0] := a[511:0]
CASE (imm8[0]) of
0: tmp[255:0] := b[255:0]
1: tmp[511:256] := b[255:0]
ESAC
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := tmp[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
kmov
__mmask16 _mm512_int2mask (int mask)
Synopsis
__mmask16 _mm512_int2mask (int mask)
#include "immintrin.h"
Instruction: kmov k, r32
CPUID Flags: KNCNI
Description
Converts integer mask into bitmask, storing the result in dst.
Operation
dst := mask[15:0]
...
__m128d _mm_invcbrt_pd (__m128d a)
Synopsis
__m128d _mm_invcbrt_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the inverse cube root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := InvCubeRoot(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
...
__m256d _mm256_invcbrt_pd (__m256d a)
Synopsis
__m256d _mm256_invcbrt_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the inverse cube root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := InvCubeRoot(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
...
__m128 _mm_invcbrt_ps (__m128 a)
Synopsis
__m128 _mm_invcbrt_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the inverse cube root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := InvCubeRoot(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256 _mm256_invcbrt_ps (__m256 a)
Synopsis
__m256 _mm256_invcbrt_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the inverse cube root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := InvCubeRoot(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
invpcid
void _invpcid (unsigned int type, void* descriptor)
Synopsis
void _invpcid (unsigned int type, void* descriptor)
#include "immintrin.h"
Instruction: invpcid r32, m128
CPUID Flags: INVPCID
Description
Invalidate mappings in the Translation Lookaside Buffers (TLBs) and paging-structure caches for the processor context identifier (PCID) specified by descriptor based on the invalidation type specified in type.
The PCID descriptor is specified as a 16-byte memory operand (with no alignment restrictions) where bits [11:0] specify the PCID, and bits [127:64] specify the linear address; bits [63:12] are reserved.
The types supported are:
0) Individual-address invalidation: If type is 0, the logical processor invalidates mappings for a single linear address and tagged with the PCID specified in descriptor, except global translations. The instruction may also invalidate global translations, mappings for other linear addresses, or mappings tagged with other PCIDs.
1) Single-context invalidation: If type is 1, the logical processor invalidates all mappings tagged with the PCID specified in descriptor except global translations. In some cases, it may invalidate mappings for other PCIDs as well.
2) All-context invalidation: If type is 2, the logical processor invalidates all mappings tagged with any PCID.
3) All-context invalidation, retaining global translations: If type is 3, the logical processor invalidates all mappings tagged with any PCID except global translations, ignoring descriptor. The instruction may also invalidate global translations as well.
Operation
CASE type OF
0: // individual-address invalidation retaining global translations
OP_PCID := descriptor[11:0]
ADDR := descriptor[127:64]
BREAK
1: // single PCID invalidation retaining globals
OP_PCID := descriptor[11:0]
// invalidate all mappings tagged with OP_PCID except global translations
BREAK
2: // all PCID invalidation
// invalidate all mappings tagged with any PCID
BREAK
3: // all PCID invalidation retaining global translations
// invalidate all mappings tagged with any PCID except global translations
BREAK
ESAC
...
__m128d _mm_invsqrt_pd (__m128d a)
Synopsis
__m128d _mm_invsqrt_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the inverse square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := InvSQRT(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
...
__m256d _mm256_invsqrt_pd (__m256d a)
Synopsis
__m256d _mm256_invsqrt_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the inverse square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := InvSQRT(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
...
__m512d _mm512_invsqrt_pd (__m512d a)
Synopsis
__m512d _mm512_invsqrt_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the inverse square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := InvSQRT(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
...
__m512d _mm512_mask_invsqrt_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_invsqrt_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the inverse square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := InvSQRT(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128 _mm_invsqrt_ps (__m128 a)
Synopsis
__m128 _mm_invsqrt_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the inverse square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := InvSQRT(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256 _mm256_invsqrt_ps (__m256 a)
Synopsis
__m256 _mm256_invsqrt_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the inverse square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := InvSQRT(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
...
__m512 _mm512_invsqrt_ps (__m512 a)
Synopsis
__m512 _mm512_invsqrt_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the inverse square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := InvSQRT(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
...
__m512 _mm512_mask_invsqrt_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_invsqrt_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the inverse square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := InvSQRT(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128i _mm_irem_epi32 (__m128i a, __m128i b)
Synopsis
__m128i _mm_irem_epi32 (__m128i a, __m128i b)
#include "immintrin.h"
CPUID Flags: SSE
Description
Divide packed 32-bit integers in a by packed 32-bit integers in b, and store the remainders as packed 32-bit integers in dst.
Operation
FOR j := 0 to 3
i := 32*j
dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256i _mm256_irem_epi32 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_irem_epi32 (__m256i a, __m256i b)
#include "immintrin.h"
CPUID Flags: AVX
Description
Divide packed 32-bit integers in a by packed 32-bit integers in b, and store the remainders as packed 32-bit integers in dst.
Operation
FOR j := 0 to 7
i := 32*j
dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:256] := 0
kandw
__mmask16 _mm512_kand (__mmask16 a, __mmask16 b)
Synopsis
__mmask16 _mm512_kand (__mmask16 a, __mmask16 b)
#include "immintrin.h"
Instruction: kandw k, k, k
CPUID Flags: AVX512F
Description
Compute the bitwise AND of 16-bit masks a and b, and store the result in k.
Operation
k[15:0] := a[15:0] AND b[15:0]
k[MAX:16] := 0
kand
__mmask16 _mm512_kand (__mmask16 a, __mmask16 b)
Synopsis
__mmask16 _mm512_kand (__mmask16 a, __mmask16 b)
#include "immintrin.h"
Instruction: kand k, k
CPUID Flags: KNCNI
Description
Compute the bitwise AND of 16-bit masks a and b, and store the result in k.
Operation
k[15:0] := a[15:0] AND b[15:0]
k[MAX:16] := 0
kandn
__mmask16 _mm512_kandn (__mmask16 a, __mmask16 b)
Synopsis
__mmask16 _mm512_kandn (__mmask16 a, __mmask16 b)
#include "immintrin.h"
Instruction: kandn k, k
CPUID Flags: KNCNI
Description
Compute the bitwise AND NOT of 16-bit masks a and b, and store the result in k.
Operation
k[15:0] := (NOT a[15:0]) AND b[15:0]
k[MAX:16] := 0
kandnw
__mmask16 _mm512_kandn (__mmask16 a, __mmask16 b)
Synopsis
__mmask16 _mm512_kandn (__mmask16 a, __mmask16 b)
#include "immintrin.h"
Instruction: kandnw k, k, k
CPUID Flags: AVX512F
Description
Compute the bitwise AND NOT of 16-bit masks a and b, and store the result in k.
Operation
k[15:0] := (NOT a[15:0]) AND b[15:0]
k[MAX:16] := 0
kandnr
__mmask16 _mm512_kandnr (__mmask16 k1, __mmask16 k2)
Synopsis
__mmask16 _mm512_kandnr (__mmask16 k1, __mmask16 k2)
#include "immintrin.h"
Instruction: kandnr k, k
CPUID Flags: KNCNI
Description
Performs a bitwise AND operation between NOT of k2 and k1, storing the result in dst.
Operation
dst[15:0] := NOT(k2[15:0]) & k1[15:0]
kconcath
__int64 _mm512_kconcathi_64 (__mmask16 k1, __mmask16 k2)
Synopsis
__int64 _mm512_kconcathi_64 (__mmask16 k1, __mmask16 k2)
#include "immintrin.h"
Instruction: kconcath r, k, k
CPUID Flags: KNCNI
Description
Packs masks k1 and k2 into the high 32 bits of dst. The rest of dst is set to 0.
Operation
dst[63:48] := k1[15:0]
dst[47:32] := k2[15:0]
dst[31:0] := 0
kconcatl
__int64 _mm512_kconcatlo_64 (__mmask16 k1, __mmask16 k2)
Synopsis
__int64 _mm512_kconcatlo_64 (__mmask16 k1, __mmask16 k2)
#include "immintrin.h"
Instruction: kconcatl r, k, k
CPUID Flags: KNCNI
Description
Packs masks k1 and k2 into the low 32 bits of dst. The rest of dst is set to 0.
Operation
dst[31:16] := k1[15:0]
dst[15:0] := k2[15:0]
dst[63:32] := 0
kextract
__mmask16 _mm512_kextract_64 (__int64 a, const int b)
Synopsis
__mmask16 _mm512_kextract_64 (__int64 a, const int b)
#include "immintrin.h"
Instruction: kextract k, r, imm
CPUID Flags: KNCNI
Description
Extracts a 16-bit field from 64-bit integer a at the position selected by index b, storing the result in dst.
Operation
CASE b of
0: dst[15:0] := a[63:48]
1: dst[15:0] := a[47:32]
2: dst[15:0] := a[31:16]
3: dst[15:0] := a[15:0]
ESAC
dst[MAX:16] := 0
kmerge2l1h
__mmask16 _mm512_kmerge2l1h (__mmask16 k1, __mmask16 k2)
Synopsis
__mmask16 _mm512_kmerge2l1h (__mmask16 k1, __mmask16 k2)
#include "immintrin.h"
Instruction: kmerge2l1h k, k
CPUID Flags: KNCNI
Description
Move the high element from k1 to the low element of k1, and insert the low element of k2 into the high element of k1.
Operation
tmp[7:0] := k1[15:8]
k1[15:8] := k2[7:0]
k1[7:0] := tmp[7:0]
kmerge2l1l
__mmask16 _mm512_kmerge2l1l (__mmask16 k1, __mmask16 k2)
Synopsis
__mmask16 _mm512_kmerge2l1l (__mmask16 k1, __mmask16 k2)
#include "immintrin.h"
Instruction: kmerge2l1l k, k
CPUID Flags: KNCNI
Description
Insert the low element of k2 into the high element of k1.
Operation
k1[15:8] := k2[7:0]
kmovw
__mmask16 _mm512_kmov (__mmask16 a)
Synopsis
__mmask16 _mm512_kmov (__mmask16 a)
#include "immintrin.h"
Instruction: kmovw k, k
CPUID Flags: AVX512F
Description
Copy 16-bit mask a to k.
Operation
k[15:0] := a[15:0]
k[MAX:16] := 0
kmov
__mmask16 _mm512_kmov (__mmask16 a)
Synopsis
__mmask16 _mm512_kmov (__mmask16 a)
#include "immintrin.h"
Instruction: kmov k, k
CPUID Flags: KNCNI
Description
Copy 16-bit mask a to k.
Operation
k[15:0] := a[15:0]
k[MAX:16] := 0
kmerge2l1l
__mmask16 _mm512_kmovlhb (__mmask16 k1, __mmask16 k2)
Synopsis
__mmask16 _mm512_kmovlhb (__mmask16 k1, __mmask16 k2)
#include "immintrin.h"
Instruction: kmerge2l1l k, k
CPUID Flags: KNCNI
Description
Inserts the low byte of mask k2 into the high byte of dst, and copies the low byte of k1 to the low byte of dst.
Operation
dst[7:0] := k1[7:0]
dst[15:8] := k2[7:0]
knot
__mmask16 _mm512_knot (__mmask16 a)
Synopsis
__mmask16 _mm512_knot (__mmask16 a)
#include "immintrin.h"
Instruction: knot k, k
CPUID Flags: KNCNI
Description
Compute the bitwise NOT of 16-bit mask a, and store the result in k.
Operation
k[15:0] := NOT a[15:0]
k[MAX:16] := 0
knotw
__mmask16 _mm512_knot (__mmask16 a)
Synopsis
__mmask16 _mm512_knot (__mmask16 a)
#include "immintrin.h"
Instruction: knotw k, k
CPUID Flags: AVX512F
Description
Compute the bitwise NOT of 16-bit mask a, and store the result in k.
Operation
k[15:0] := NOT a[15:0]
k[MAX:16] := 0
kor
__mmask16 _mm512_kor (__mmask16 a, __mmask16 b)
Synopsis
__mmask16 _mm512_kor (__mmask16 a, __mmask16 b)
#include "immintrin.h"
Instruction: kor k, k
CPUID Flags: KNCNI
Description
Compute the bitwise OR of 16-bit masks a and b, and store the result in k.
Operation
k[15:0] := a[15:0] OR b[15:0]
k[MAX:16] := 0
korw
__mmask16 _mm512_kor (__mmask16 a, __mmask16 b)
Synopsis
__mmask16 _mm512_kor (__mmask16 a, __mmask16 b)
#include "immintrin.h"
Instruction: korw k, k, k
CPUID Flags: AVX512F
Description
Compute the bitwise OR of 16-bit masks a and b, and store the result in k.
Operation
k[15:0] := a[15:0] OR b[15:0]
k[MAX:16] := 0
kortest
int _mm512_kortestc (__mmask16 k1, __mmask16 k2)
Synopsis
int _mm512_kortestc (__mmask16 k1, __mmask16 k2)
#include "immintrin.h"
Instruction: kortest k, k
CPUID Flags: KNCNI
Description
Performs bitwise OR between k1 and k2, storing the result in dst. CF flag is set if dst consists of all 1's.
Operation
dst[15:0] := k1[15:0] | k2[15:0]
IF PopCount(dst[15:0]) = 16
SetCF()
FI
kortestw
int _mm512_kortestc (__mmask16 k1, __mmask16 k2)
Synopsis
int _mm512_kortestc (__mmask16 k1, __mmask16 k2)
#include "immintrin.h"
Instruction: kortestw k, k
CPUID Flags: AVX512F
Description
Performs bitwise OR between k1 and k2, storing the result in dst. CF flag is set if dst consists of all 1's.
Operation
dst[15:0] := k1[15:0] | k2[15:0]
IF PopCount(dst[15:0]) = 16
SetCF()
FI
kortestw
int _mm512_kortestz (__mmask16 k1, __mmask16 k2)
Synopsis
int _mm512_kortestz (__mmask16 k1, __mmask16 k2)
#include "immintrin.h"
Instruction: kortestw k, k
CPUID Flags: AVX512F
Description
Performs bitwise OR between k1 and k2, storing the result in dst. ZF flag is set if dst is 0.
Operation
dst[15:0] := k1[15:0] | k2[15:0]
IF dst = 0
SetZF()
FI
kortest
int _mm512_kortestz (__mmask16 k1, __mmask16 k2)
Synopsis
int _mm512_kortestz (__mmask16 k1, __mmask16 k2)
#include "immintrin.h"
Instruction: kortest k, k
CPUID Flags: KNCNI
Description
Performs bitwise OR between k1 and k2, storing the result in dst. ZF flag is set if dst is 0.
Operation
dst[15:0] := k1[15:0] | k2[15:0]
IF dst = 0
SetZF()
FI
kmerge2l1h
__mmask16 _mm512_kswapb (__mmask16 k1, __mmask16 k2)
Synopsis
__mmask16 _mm512_kswapb (__mmask16 k1, __mmask16 k2)
#include "immintrin.h"
Instruction: kmerge2l1h k, k
CPUID Flags: KNCNI
Description
Moves high byte from k2 to low byte of k1, and moves low byte of k2 to high byte of k1.
Operation
tmp[7:0] := k2[15:8]
k2[15:8] := k1[7:0]
k1[7:0] := tmp[7:0]
tmp[7:0] := k2[7:0]
k2[7:0] := k1[15:8]
k1[15:8] := tmp[7:0]
kunpckbw
__mmask16 _mm512_kunpackb (__mmask16 a, __mmask16 b)
Synopsis
__mmask16 _mm512_kunpackb (__mmask16 a, __mmask16 b)
#include "immintrin.h"
Instruction: kunpckbw k, k, k
CPUID Flags: AVX512F
Description
Unpack and interleave 8 bits from masks a and b, and store the 16-bit result in k.
Operation
k[7:0] := b[7:0]
k[15:8] := a[7:0]
k[MAX:16] := 0
kunpckdq
__mmask64 _mm512_kunpackd (__mmask64 a, __mmask64 b)
Synopsis
__mmask64 _mm512_kunpackd (__mmask64 a, __mmask64 b)
#include "immintrin.h"
Instruction: kunpckdq
CPUID Flags: AVX512BW
Description
Unpack and interleave 32 bits from masks a and b, and store the 64-bit result in k.
Operation
k[31:0] := a[31:0]
k[63:32] := b[31:0]
k[MAX:64] := 0
kunpckwd
__mmask32 _mm512_kunpackw (__mmask32 a, __mmask32 b)
Synopsis
__mmask32 _mm512_kunpackw (__mmask32 a, __mmask32 b)
#include "immintrin.h"
Instruction: kunpckwd
CPUID Flags: AVX512BW
Description
Unpack and interleave 16 bits from masks a and b, and store the 32-bit result in k.
Operation
k[15:0] := a[15:0]
k[31:16] := b[15:0]
k[MAX:32] := 0
kxnorw
__mmask16 _mm512_kxnor (__mmask16 a, __mmask16 b)
Synopsis
__mmask16 _mm512_kxnor (__mmask16 a, __mmask16 b)
#include "immintrin.h"
Instruction: kxnorw k, k, k
CPUID Flags: AVX512F
Description
Compute the bitwise XNOR of 16-bit masks a and b, and store the result in k.
Operation
k[15:0] := NOT (a[15:0] XOR b[15:0])
k[MAX:16] := 0
kxnor
__mmask16 _mm512_kxnor (__mmask16 a, __mmask16 b)
Synopsis
__mmask16 _mm512_kxnor (__mmask16 a, __mmask16 b)
#include "immintrin.h"
Instruction: kxnor k, k
CPUID Flags: KNCNI
Description
Compute the bitwise XNOR of 16-bit masks a and b, and store the result in k.
Operation
k[15:0] := NOT (a[15:0] XOR b[15:0])
k[MAX:16] := 0
kxor
__mmask16 _mm512_kxor (__mmask16 a, __mmask16 b)
Synopsis
__mmask16 _mm512_kxor (__mmask16 a, __mmask16 b)
#include "immintrin.h"
Instruction: kxor k, k
CPUID Flags: KNCNI
Description
Compute the bitwise XOR of 16-bit masks a and b, and store the result in k.
Operation
k[15:0] := a[15:0] XOR b[15:0]
k[MAX:16] := 0
kxorw
__mmask16 _mm512_kxor (__mmask16 a, __mmask16 b)
Synopsis
__mmask16 _mm512_kxor (__mmask16 a, __mmask16 b)
#include "immintrin.h"
Instruction: kxorw k, k, k
CPUID Flags: AVX512F
Description
Compute the bitwise XOR of 16-bit masks a and b, and store the result in k.
Operation
k[15:0] := a[15:0] XOR b[15:0]
k[MAX:16] := 0
lddqu
__m128i _mm_lddqu_si128 (__m128i const* mem_addr)
Synopsis
__m128i _mm_lddqu_si128 (__m128i const* mem_addr)
#include "pmmintrin.h"
Instruction: lddqu xmm, m128
CPUID Flags: SSE3
Description
Load 128-bits of integer data from unaligned memory into dst. This intrinsic may perform better than _mm_loadu_si128 when the data crosses a cache line boundary.
Operation
dst[127:0] := MEM[mem_addr+127:mem_addr]
vlddqu
__m256i _mm256_lddqu_si256 (__m256i const * mem_addr)
Synopsis
__m256i _mm256_lddqu_si256 (__m256i const * mem_addr)
#include "immintrin.h"
Instruction: vlddqu ymm, m256
CPUID Flags: AVX
Description
Load 256-bits of integer data from unaligned memory into dst. This intrinsic may perform better than _mm256_loadu_si256 when the data crosses a cache line boundary.
Operation
dst[255:0] := MEM[mem_addr+255:mem_addr]
dst[MAX:256] := 0
lfence
void _mm_lfence (void)
Synopsis
void _mm_lfence (void)
#include "emmintrin.h"
Instruction: lfence
CPUID Flags: SSE2
Description
Perform a serializing operation on all load-from-memory instructions that were issued prior to this instruction. Guarantees that every load instruction that precedes, in program order, is globally visible before any load instruction which follows the fence in program order.
Performance
vmovdqa32
__m128i _mm_mask_load_epi32 (__m128i src, __mmask8 k, void const* mem_addr)
Synopsis
__m128i _mm_mask_load_epi32 (__m128i src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqa32
CPUID Flags: AVX512VL + AVX512F
Description
Load packed 32-bit integers from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vmovdqa32
__m128i _mm_maskz_load_epi32 (__mmask8 k, void const* mem_addr)
Synopsis
__m128i _mm_maskz_load_epi32 (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqa32
CPUID Flags: AVX512VL + AVX512F
Description
Load packed 32-bit integers from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vmovdqa32
__m256i _mm256_mask_load_epi32 (__m256i src, __mmask8 k, void const* mem_addr)
Synopsis
__m256i _mm256_mask_load_epi32 (__m256i src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqa32
CPUID Flags: AVX512VL + AVX512F
Description
Load packed 32-bit integers from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vmovdqa32
__m256i _mm256_maskz_load_epi32 (__mmask8 k, void const* mem_addr)
Synopsis
__m256i _mm256_maskz_load_epi32 (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqa32
CPUID Flags: AVX512VL + AVX512F
Description
Load packed 32-bit integers from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vmovdqa32
__m512i _mm512_load_epi32 (void const* mem_addr)
Synopsis
__m512i _mm512_load_epi32 (void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqa32 zmm {k}, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Load 512-bits (composed of 16 packed 32-bit integers) from memory into dst.
mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
Operation
dst[511:0] := MEM[mem_addr+511:mem_addr]
dst[MAX:512] := 0
vmovdqa32
__m512i _mm512_mask_load_epi32 (__m512i src, __mmask16 k, void const* mem_addr)
Synopsis
__m512i _mm512_mask_load_epi32 (__m512i src, __mmask16 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqa32 zmm {k}, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Load packed 32-bit integers from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vmovdqa32
__m512i _mm512_maskz_load_epi32 (__mmask16 k, void const* mem_addr)
Synopsis
__m512i _mm512_maskz_load_epi32 (__mmask16 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqa32 zmm {k}, m512
CPUID Flags: AVX512F
Description
Load packed 32-bit integers from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vmovdqa64
__m128i _mm_mask_load_epi64 (__m128i src, __mmask8 k, void const* mem_addr)
Synopsis
__m128i _mm_mask_load_epi64 (__m128i src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqa64
CPUID Flags: AVX512VL + AVX512F
Description
Load packed 64-bit integers from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vmovdqa64
__m128i _mm_maskz_load_epi64 (__mmask8 k, void const* mem_addr)
Synopsis
__m128i _mm_maskz_load_epi64 (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqa64
CPUID Flags: AVX512VL + AVX512F
Description
Load packed 64-bit integers from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vmovdqa64
__m256i _mm256_mask_load_epi64 (__m256i src, __mmask8 k, void const* mem_addr)
Synopsis
__m256i _mm256_mask_load_epi64 (__m256i src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqa64
CPUID Flags: AVX512VL + AVX512F
Description
Load packed 64-bit integers from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vmovdqa64
__m256i _mm256_maskz_load_epi64 (__mmask8 k, void const* mem_addr)
Synopsis
__m256i _mm256_maskz_load_epi64 (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqa64
CPUID Flags: AVX512VL + AVX512F
Description
Load packed 64-bit integers from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vmovdqa64
__m512i _mm512_load_epi64 (void const* mem_addr)
Synopsis
__m512i _mm512_load_epi64 (void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqa64 zmm {k}, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Load 512-bits (composed of 8 packed 64-bit integers) from memory into dst.
mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
Operation
dst[511:0] := MEM[mem_addr+511:mem_addr]
dst[MAX:512] := 0
vmovdqa64
__m512i _mm512_mask_load_epi64 (__m512i src, __mmask8 k, void const* mem_addr)
Synopsis
__m512i _mm512_mask_load_epi64 (__m512i src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqa64 zmm {k}, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Load packed 64-bit integers from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vmovdqa64
__m512i _mm512_maskz_load_epi64 (__mmask8 k, void const* mem_addr)
Synopsis
__m512i _mm512_maskz_load_epi64 (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqa64 zmm {k}, m512
CPUID Flags: AVX512F
Description
Load packed 64-bit integers from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
movapd
__m128d _mm_load_pd (double const* mem_addr)
Synopsis
__m128d _mm_load_pd (double const* mem_addr)
#include "emmintrin.h"
Instruction: movapd xmm, m128
CPUID Flags: SSE2
Description
Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory into dst.
mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
Operation
dst[127:0] := MEM[mem_addr+127:mem_addr]
vmovapd
__m128d _mm_mask_load_pd (__m128d src, __mmask8 k, void const* mem_addr)
Synopsis
__m128d _mm_mask_load_pd (__m128d src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovapd
CPUID Flags: AVX512VL + AVX512F
Description
Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vmovapd
__m128d _mm_maskz_load_pd (__mmask8 k, void const* mem_addr)
Synopsis
__m128d _mm_maskz_load_pd (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovapd
CPUID Flags: AVX512VL + AVX512F
Description
Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vmovapd
__m256d _mm256_load_pd (double const * mem_addr)
Synopsis
__m256d _mm256_load_pd (double const * mem_addr)
#include "immintrin.h"
Instruction: vmovapd ymm, m256
CPUID Flags: AVX
Description
Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from memory into dst.
mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.
Operation
dst[255:0] := MEM[mem_addr+255:mem_addr]
dst[MAX:256] := 0
vmovapd
__m256d _mm256_mask_load_pd (__m256d src, __mmask8 k, void const* mem_addr)
Synopsis
__m256d _mm256_mask_load_pd (__m256d src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovapd
CPUID Flags: AVX512VL + AVX512F
Description
Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vmovapd
__m256d _mm256_maskz_load_pd (__mmask8 k, void const* mem_addr)
Synopsis
__m256d _mm256_maskz_load_pd (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovapd
CPUID Flags: AVX512VL + AVX512F
Description
Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vmovapd
__m512d _mm512_load_pd (void const* mem_addr)
Synopsis
__m512d _mm512_load_pd (void const* mem_addr)
#include "immintrin.h"
Instruction: vmovapd zmm {k}, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Load 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from memory into dst.
mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
Operation
dst[511:0] := MEM[mem_addr+511:mem_addr]
dst[MAX:512] := 0
vmovapd
__m512d _mm512_mask_load_pd (__m512d src, __mmask8 k, void const* mem_addr)
Synopsis
__m512d _mm512_mask_load_pd (__m512d src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovapd zmm {k}, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vmovapd
__m512d _mm512_maskz_load_pd (__mmask8 k, void const* mem_addr)
Synopsis
__m512d _mm512_maskz_load_pd (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovapd zmm {k}, m512
CPUID Flags: AVX512F
Description
Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
...
__m128d _mm_load_pd1 (double const* mem_addr)
Synopsis
__m128d _mm_load_pd1 (double const* mem_addr)
#include "emmintrin.h"
Instruction: movapd xmm, m128
CPUID Flags: SSE2
Description
Load a double-precision (64-bit) floating-point element from memory into both elements of dst.
Operation
dst[63:0] := MEM[mem_addr+63:mem_addr]
dst[127:64] := MEM[mem_addr+63:mem_addr]
movaps
__m128 _mm_load_ps (float const* mem_addr)
Synopsis
__m128 _mm_load_ps (float const* mem_addr)
#include "xmmintrin.h"
Instruction: movaps xmm, m128
CPUID Flags: SSE
Description
Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory into dst.
mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
Operation
dst[127:0] := MEM[mem_addr+127:mem_addr]
vmovaps
__m128 _mm_mask_load_ps (__m128 src, __mmask8 k, void const* mem_addr)
Synopsis
__m128 _mm_mask_load_ps (__m128 src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovaps
CPUID Flags: AVX512VL + AVX512F
Description
Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vmovaps
__m128 _mm_maskz_load_ps (__mmask8 k, void const* mem_addr)
Synopsis
__m128 _mm_maskz_load_ps (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovaps
CPUID Flags: AVX512VL + AVX512F
Description
Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vmovaps
__m256 _mm256_load_ps (float const * mem_addr)
Synopsis
__m256 _mm256_load_ps (float const * mem_addr)
#include "immintrin.h"
Instruction: vmovaps ymm, m256
CPUID Flags: AVX
Description
Load 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from memory into dst.
mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.
Operation
dst[255:0] := MEM[mem_addr+255:mem_addr]
dst[MAX:256] := 0
vmovaps
__m256 _mm256_mask_load_ps (__m256 src, __mmask8 k, void const* mem_addr)
Synopsis
__m256 _mm256_mask_load_ps (__m256 src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovaps
CPUID Flags: AVX512VL + AVX512F
Description
Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vmovaps
__m256 _mm256_maskz_load_ps (__mmask8 k, void const* mem_addr)
Synopsis
__m256 _mm256_maskz_load_ps (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovaps
CPUID Flags: AVX512VL + AVX512F
Description
Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vmovaps
__m512 _mm512_load_ps (void const* mem_addr)
Synopsis
__m512 _mm512_load_ps (void const* mem_addr)
#include "immintrin.h"
Instruction: vmovaps zmm {k}, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Load 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from memory into dst.
mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
Operation
dst[511:0] := MEM[mem_addr+511:mem_addr]
dst[MAX:512] := 0
vmovaps
__m512 _mm512_mask_load_ps (__m512 src, __mmask16 k, void const* mem_addr)
Synopsis
__m512 _mm512_mask_load_ps (__m512 src, __mmask16 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovaps zmm {k}, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set). mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vmovaps
__m512 _mm512_maskz_load_ps (__mmask16 k, void const* mem_addr)
Synopsis
__m512 _mm512_maskz_load_ps (__mmask16 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovaps zmm {k}, m512
CPUID Flags: AVX512F
Description
Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
...
__m128 _mm_load_ps1 (float const* mem_addr)
Synopsis
__m128 _mm_load_ps1 (float const* mem_addr)
#include "xmmintrin.h"
CPUID Flags: SSE
Description
Load a single-precision (32-bit) floating-point element from memory into all elements of dst.
Operation
dst[31:0] := MEM[mem_addr+31:mem_addr]
dst[63:32] := MEM[mem_addr+31:mem_addr]
dst[95:64] := MEM[mem_addr+31:mem_addr]
dst[127:96] := MEM[mem_addr+31:mem_addr]
movsd
__m128d _mm_load_sd (double const* mem_addr)
Synopsis
__m128d _mm_load_sd (double const* mem_addr)
#include "emmintrin.h"
Instruction: movsd xmm, m64
CPUID Flags: SSE2
Description
Load a double-precision (64-bit) floating-point element from memory into the lower of dst, and zero the upper element. mem_addr does not need to be aligned on any particular boundary.
Operation
dst[63:0] := MEM[mem_addr+63:mem_addr]
dst[127:64] := 0
vmovsd
__m128d _mm_mask_load_sd (__m128d src, __mmask8 k, const double* mem_addr)
Synopsis
__m128d _mm_mask_load_sd (__m128d src, __mmask8 k, const double* mem_addr)
#include "immintrin.h"
Instruction: vmovsd xmm {k}, m64
CPUID Flags: AVX512F
Description
Load a double-precision (64-bit) floating-point element from memory into the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and set the upper element of dst to zero. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
Operation
IF k[0]
dst[63:0] := MEM[mem_addr+63:mem_addr]
ELSE
dst[63:0] := src[63:0]
FI
dst[MAX:64] := 0
vmovsd
__m128d _mm_maskz_load_sd (__mmask8 k, const double* mem_addr)
Synopsis
__m128d _mm_maskz_load_sd (__mmask8 k, const double* mem_addr)
#include "immintrin.h"
Instruction: vmovsd xmm {k}, m64
CPUID Flags: AVX512F
Description
Load a double-precision (64-bit) floating-point element from memory into the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and set the upper element of dst to zero. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
Operation
IF k[0]
dst[63:0] := MEM[mem_addr+63:mem_addr]
ELSE
dst[63:0] := 0
FI
dst[MAX:64] := 0
movdqa
__m128i _mm_load_si128 (__m128i const* mem_addr)
Synopsis
__m128i _mm_load_si128 (__m128i const* mem_addr)
#include "emmintrin.h"
Instruction: movdqa xmm, m128
CPUID Flags: SSE2
Description
Load 128-bits of integer data from memory into dst.
mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
Operation
dst[127:0] := MEM[mem_addr+127:mem_addr]
vmovdqa
__m256i _mm256_load_si256 (__m256i const * mem_addr)
Synopsis
__m256i _mm256_load_si256 (__m256i const * mem_addr)
#include "immintrin.h"
Instruction: vmovdqa ymm, m256
CPUID Flags: AVX
Description
Load 256-bits of integer data from memory into dst.
mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.
Operation
dst[255:0] := MEM[mem_addr+255:mem_addr]
dst[MAX:256] := 0
vmovdqa32
__m512i _mm512_load_si512 (void const* mem_addr)
Synopsis
__m512i _mm512_load_si512 (void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqa32 zmm {k}, m512
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Load 512-bits of integer data from memory into dst.
mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
Operation
dst[511:0] := MEM[mem_addr+511:mem_addr]
dst[MAX:512] := 0
movss
__m128 _mm_load_ss (float const* mem_addr)
Synopsis
__m128 _mm_load_ss (float const* mem_addr)
#include "xmmintrin.h"
Instruction: movss xmm, m32
CPUID Flags: SSE
Description
Load a single-precision (32-bit) floating-point element from memory into the lower of dst, and zero the upper 3 elements. mem_addr does not need to be aligned on any particular boundary.
Operation
dst[31:0] := MEM[mem_addr+31:mem_addr]
dst[127:32] := 0
vmovss
__m128 _mm_mask_load_ss (__m128 src, __mmask8 k, const float* mem_addr)
Synopsis
__m128 _mm_mask_load_ss (__m128 src, __mmask8 k, const float* mem_addr)
#include "immintrin.h"
Instruction: vmovss xmm {k}, m32
CPUID Flags: AVX512F
Description
Load a single-precision (32-bit) floating-point element from memory into the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and set the upper elements of dst to zero. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
Operation
IF k[0]
dst[31:0] := MEM[mem_addr+31:mem_addr]
ELSE
dst[31:0] := src[31:0]
FI
dst[MAX:32] := 0
vmovss
__m128 _mm_maskz_load_ss (__mmask8 k, const float* mem_addr)
Synopsis
__m128 _mm_maskz_load_ss (__mmask8 k, const float* mem_addr)
#include "immintrin.h"
Instruction: vmovss xmm {k}, m32
CPUID Flags: AVX512F
Description
Load a single-precision (32-bit) floating-point element from memory into the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and set the upper elements of dst to zero. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
Operation
IF k[0]
dst[31:0] := MEM[mem_addr+31:mem_addr]
ELSE
dst[31:0] := 0
FI
dst[MAX:32] := 0
...
__m128d _mm_load1_pd (double const* mem_addr)
Synopsis
__m128d _mm_load1_pd (double const* mem_addr)
#include "emmintrin.h"
Instruction: movapd xmm, m128
CPUID Flags: SSE2
Description
Load a double-precision (64-bit) floating-point element from memory into both elements of dst.
Operation
dst[63:0] := MEM[mem_addr+63:mem_addr]
dst[127:64] := MEM[mem_addr+63:mem_addr]
...
__m128 _mm_load1_ps (float const* mem_addr)
Synopsis
__m128 _mm_load1_ps (float const* mem_addr)
#include "xmmintrin.h"
CPUID Flags: SSE
Description
Load a single-precision (32-bit) floating-point element from memory into all elements of dst.
Operation
dst[31:0] := MEM[mem_addr+31:mem_addr]
dst[63:32] := MEM[mem_addr+31:mem_addr]
dst[95:64] := MEM[mem_addr+31:mem_addr]
dst[127:96] := MEM[mem_addr+31:mem_addr]
...
short _loadbe_i16 (void const * ptr)
Synopsis
short _loadbe_i16 (void const * ptr)
#include "immintrin.h"
Description
Loads a big-endian word (16-bit) value from address ptr and stores the result in dst.
Operation
addr := MEM[ptr]
FOR j := 0 to 1
i := j*8
dst[i+7:i] := addr[15-i:15-i-7]
ENDFOR
...
int _loadbe_i32 (void const * ptr)
Synopsis
int _loadbe_i32 (void const * ptr)
#include "immintrin.h"
Description
Loads a big-endian double word (32-bit) value from address ptr and stores the result in dst.
Operation
addr := MEM[ptr]
FOR j := 0 to 3
i := j*8
dst[i+7:i] := addr[31-i:31-i-7]
ENDFOR
...
__int64 _loadbe_i64 (void const * ptr)
Synopsis
__int64 _loadbe_i64 (void const * ptr)
#include "immintrin.h"
Description
Loads a big-endian quad word (64-bit) value from address ptr and stores the result in dst.
Operation
addr := MEM[ptr]
FOR j := 0 to 7
i := j*8
dst[i+7:i] := addr[63-i:63-i-7]
ENDFOR
movddup
__m128d _mm_loaddup_pd (double const* mem_addr)
Synopsis
__m128d _mm_loaddup_pd (double const* mem_addr)
#include "pmmintrin.h"
Instruction: movddup xmm, m64
CPUID Flags: SSE3
Description
Load a double-precision (64-bit) floating-point element from memory into both elements of dst.
Operation
dst[63:0] := MEM[mem_addr+63:mem_addr]
dst[127:64] := MEM[mem_addr+63:mem_addr]
movhpd
__m128d _mm_loadh_pd (__m128d a, double const* mem_addr)
Synopsis
__m128d _mm_loadh_pd (__m128d a, double const* mem_addr)
#include "emmintrin.h"
Instruction: movhpd xmm, m64
CPUID Flags: SSE2
Description
Load a double-precision (64-bit) floating-point element from memory into the upper element of dst, and copy the lower element from a to dst. mem_addr does not need to be aligned on any particular boundary.
Operation
dst[63:0] := a[63:0]
dst[127:64] := MEM[mem_addr+63:mem_addr]
Performance
movhps
__m128 _mm_loadh_pi (__m128 a, __m64 const* mem_addr)
Synopsis
__m128 _mm_loadh_pi (__m128 a, __m64 const* mem_addr)
#include "xmmintrin.h"
Instruction: movhps xmm, m64
CPUID Flags: SSE
Description
Load 2 single-precision (32-bit) floating-point elements from memory into the upper 2 elements of dst, and copy the lower 2 elements from a to dst. mem_addr does not need to be aligned on any particular boundary.
Operation
dst[31:0] := a[31:0]
dst[63:32] := a[63:32]
dst[95:64] := MEM[mem_addr+31:mem_addr]
dst[127:96] := MEM[mem_addr+63:mem_addr+32]
Performance
movq
__m128i _mm_loadl_epi64 (__m128i const* mem_addr)
Synopsis
__m128i _mm_loadl_epi64 (__m128i const* mem_addr)
#include "emmintrin.h"
Instruction: movq xmm, m64
CPUID Flags: SSE2
Description
Load 64-bit integer from memory into the first element of dst.
Operation
dst[63:0] := MEM[mem_addr+63:mem_addr]
dst[MAX:64] := 0
movlpd
__m128d _mm_loadl_pd (__m128d a, double const* mem_addr)
Synopsis
__m128d _mm_loadl_pd (__m128d a, double const* mem_addr)
#include "emmintrin.h"
Instruction: movlpd xmm, m64
CPUID Flags: SSE2
Description
Load a double-precision (64-bit) floating-point element from memory into the lower element of dst, and copy the upper element from a to dst. mem_addr does not need to be aligned on any particular boundary.
Operation
dst[63:0] := MEM[mem_addr+63:mem_addr]
dst[127:64] := a[127:64]
Performance
movlps
__m128 _mm_loadl_pi (__m128 a, __m64 const* mem_addr)
Synopsis
__m128 _mm_loadl_pi (__m128 a, __m64 const* mem_addr)
#include "xmmintrin.h"
Instruction: movlps xmm, m64
CPUID Flags: SSE
Description
Load 2 single-precision (32-bit) floating-point elements from memory into the lower 2 elements of dst, and copy the upper 2 elements from a to dst. mem_addr does not need to be aligned on any particular boundary.
Operation
dst[31:0] := MEM[mem_addr+31:mem_addr]
dst[63:32] := MEM[mem_addr+63:mem_addr+32]
dst[95:64] := a[95:64]
dst[127:96] := a[127:96]
Performance
...
__m128d _mm_loadr_pd (double const* mem_addr)
Synopsis
__m128d _mm_loadr_pd (double const* mem_addr)
#include "emmintrin.h"
Instruction: movapd xmm, m128
CPUID Flags: SSE2
Description
Load 2 double-precision (64-bit) floating-point elements from memory into dst in reverse order. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
Operation
dst[63:0] := MEM[mem_addr+127:mem_addr+64]
dst[127:64] := MEM[mem_addr+63:mem_addr]
...
__m128 _mm_loadr_ps (float const* mem_addr)
Synopsis
__m128 _mm_loadr_ps (float const* mem_addr)
#include "xmmintrin.h"
CPUID Flags: SSE
Description
Load 4 single-precision (32-bit) floating-point elements from memory into dst in reverse order. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
Operation
dst[31:0] := MEM[mem_addr+127:mem_addr+96]
dst[63:32] := MEM[mem_addr+95:mem_addr+64]
dst[95:64] := MEM[mem_addr+63:mem_addr+32]
dst[127:96] := MEM[mem_addr+31:mem_addr]
vmovdqu16
__m128i _mm_mask_loadu_epi16 (__m128i src, __mmask8 k, void const* mem_addr)
Synopsis
__m128i _mm_mask_loadu_epi16 (__m128i src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqu16
CPUID Flags: AVX512VL + AVX512BW
Description
Load packed 16-bit integers from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:128] := 0
vmovdqu16
__m128i _mm_maskz_loadu_epi16 (__mmask8 k, void const* mem_addr)
Synopsis
__m128i _mm_maskz_loadu_epi16 (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqu16
CPUID Flags: AVX512VL + AVX512BW
Description
Load packed 16-bit integers from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vmovdqu16
__m256i _mm256_mask_loadu_epi16 (__m256i src, __mmask16 k, void const* mem_addr)
Synopsis
__m256i _mm256_mask_loadu_epi16 (__m256i src, __mmask16 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqu16
CPUID Flags: AVX512VL + AVX512BW
Description
Load packed 16-bit integers from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
vmovdqu16
__m256i _mm256_maskz_loadu_epi16 (__mmask16 k, void const* mem_addr)
Synopsis
__m256i _mm256_maskz_loadu_epi16 (__mmask16 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqu16
CPUID Flags: AVX512VL + AVX512BW
Description
Load packed 16-bit integers from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vmovdqu16
__m512i _mm512_mask_loadu_epi16 (__m512i src, __mmask32 k, void const* mem_addr)
Synopsis
__m512i _mm512_mask_loadu_epi16 (__m512i src, __mmask32 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqu16
CPUID Flags: AVX512BW
Description
Load packed 16-bit integers from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:512] := 0
vmovdqu16
__m512i _mm512_maskz_loadu_epi16 (__mmask32 k, void const* mem_addr)
Synopsis
__m512i _mm512_maskz_loadu_epi16 (__mmask32 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqu16
CPUID Flags: AVX512BW
Description
Load packed 16-bit integers from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := MEM[mem_addr+i+15:mem_addr+i]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vmovdqu32
__m128i _mm_mask_loadu_epi32 (__m128i src, __mmask8 k, void const* mem_addr)
Synopsis
__m128i _mm_mask_loadu_epi32 (__m128i src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqu32
CPUID Flags: AVX512VL + AVX512F
Description
Load packed 32-bit integers from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vmovdqu32
__m128i _mm_maskz_loadu_epi32 (__mmask8 k, void const* mem_addr)
Synopsis
__m128i _mm_maskz_loadu_epi32 (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqu32
CPUID Flags: AVX512VL + AVX512F
Description
Load packed 32-bit integers from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vmovdqu32
__m256i _mm256_mask_loadu_epi32 (__m256i src, __mmask8 k, void const* mem_addr)
Synopsis
__m256i _mm256_mask_loadu_epi32 (__m256i src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqu32
CPUID Flags: AVX512VL + AVX512F
Description
Load packed 32-bit integers from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vmovdqu32
__m256i _mm256_maskz_loadu_epi32 (__mmask8 k, void const* mem_addr)
Synopsis
__m256i _mm256_maskz_loadu_epi32 (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqu32
CPUID Flags: AVX512VL + AVX512F
Description
Load packed 32-bit integers from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vmovdqu32
__m512i _mm512_mask_loadu_epi32 (__m512i src, __mmask16 k, void const* mem_addr)
Synopsis
__m512i _mm512_mask_loadu_epi32 (__m512i src, __mmask16 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqu32 zmm {k}, m512
CPUID Flags: AVX512F
Description
Load packed 32-bit integers from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vmovdqu32
__m512i _mm512_maskz_loadu_epi32 (__mmask16 k, void const* mem_addr)
Synopsis
__m512i _mm512_maskz_loadu_epi32 (__mmask16 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqu32 zmm {k}, m512
CPUID Flags: AVX512F
Description
Load packed 32-bit integers from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vmovdqu64
__m128i _mm_mask_loadu_epi64 (__m128i src, __mmask8 k, void const* mem_addr)
Synopsis
__m128i _mm_mask_loadu_epi64 (__m128i src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqu64
CPUID Flags: AVX512VL + AVX512F
Description
Load packed 64-bit integers from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vmovdqu64
__m128i _mm_maskz_loadu_epi64 (__mmask8 k, void const* mem_addr)
Synopsis
__m128i _mm_maskz_loadu_epi64 (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqu64
CPUID Flags: AVX512VL + AVX512F
Description
Load packed 64-bit integers from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vmovdqu64
__m256i _mm256_mask_loadu_epi64 (__m256i src, __mmask8 k, void const* mem_addr)
Synopsis
__m256i _mm256_mask_loadu_epi64 (__m256i src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqu64
CPUID Flags: AVX512VL + AVX512F
Description
Load packed 64-bit integers from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vmovdqu64
__m256i _mm256_maskz_loadu_epi64 (__mmask8 k, void const* mem_addr)
Synopsis
__m256i _mm256_maskz_loadu_epi64 (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqu64
CPUID Flags: AVX512VL + AVX512F
Description
Load packed 64-bit integers from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vmovdqu64
__m512i _mm512_mask_loadu_epi64 (__m512i src, __mmask8 k, void const* mem_addr)
Synopsis
__m512i _mm512_mask_loadu_epi64 (__m512i src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqu64 zmm {k}, m512
CPUID Flags: AVX512F
Description
Load packed 64-bit integers from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vmovdqu64
__m512i _mm512_maskz_loadu_epi64 (__mmask8 k, void const* mem_addr)
Synopsis
__m512i _mm512_maskz_loadu_epi64 (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqu64 zmm {k}, m512
CPUID Flags: AVX512F
Description
Load packed 64-bit integers from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vmovdqu8
__m128i _mm_mask_loadu_epi8 (__m128i src, __mmask16 k, void const* mem_addr)
Synopsis
__m128i _mm_mask_loadu_epi8 (__m128i src, __mmask16 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqu8
CPUID Flags: AVX512VL + AVX512BW
Description
Load packed 8-bit integers from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 15
i := j*8
IF k[j]
dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i]
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:128] := 0
vmovdqu8
__m128i _mm_maskz_loadu_epi8 (__mmask16 k, void const* mem_addr)
Synopsis
__m128i _mm_maskz_loadu_epi8 (__mmask16 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqu8
CPUID Flags: AVX512VL + AVX512BW
Description
Load packed 8-bit integers from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 15
i := j*8
IF k[j]
dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i]
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vmovdqu8
__m256i _mm256_mask_loadu_epi8 (__m256i src, __mmask32 k, void const* mem_addr)
Synopsis
__m256i _mm256_mask_loadu_epi8 (__m256i src, __mmask32 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqu8
CPUID Flags: AVX512VL + AVX512BW
Description
Load packed 8-bit integers from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 31
i := j*8
IF k[j]
dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i]
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:256] := 0
vmovdqu8
__m256i _mm256_maskz_loadu_epi8 (__mmask32 k, void const* mem_addr)
Synopsis
__m256i _mm256_maskz_loadu_epi8 (__mmask32 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqu8
CPUID Flags: AVX512VL + AVX512BW
Description
Load packed 8-bit integers from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 31
i := j*8
IF k[j]
dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i]
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vmovdqu8
__m512i _mm512_mask_loadu_epi8 (__m512i src, __mmask64 k, void const* mem_addr)
Synopsis
__m512i _mm512_mask_loadu_epi8 (__m512i src, __mmask64 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqu8
CPUID Flags: AVX512BW
Description
Load packed 8-bit integers from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 63
i := j*8
IF k[j]
dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i]
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:512] := 0
vmovdqu8
__m512i _mm512_maskz_loadu_epi8 (__mmask64 k, void const* mem_addr)
Synopsis
__m512i _mm512_maskz_loadu_epi8 (__mmask64 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqu8
CPUID Flags: AVX512BW
Description
Load packed 8-bit integers from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 63
i := j*8
IF k[j]
dst[i+7:i] := MEM[mem_addr+i+7:mem_addr+i]
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
movupd
__m128d _mm_loadu_pd (double const* mem_addr)
Synopsis
__m128d _mm_loadu_pd (double const* mem_addr)
#include "emmintrin.h"
Instruction: movupd xmm, m128
CPUID Flags: SSE2
Description
Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from memory into dst.
mem_addr does not need to be aligned on any particular boundary.
Operation
dst[127:0] := MEM[mem_addr+127:mem_addr]
vmovupd
__m128d _mm_mask_loadu_pd (__m128d src, __mmask8 k, void const* mem_addr)
Synopsis
__m128d _mm_mask_loadu_pd (__m128d src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovupd
CPUID Flags: AVX512VL + AVX512F
Description
Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vmovupd
__m128d _mm_maskz_loadu_pd (__mmask8 k, void const* mem_addr)
Synopsis
__m128d _mm_maskz_loadu_pd (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovupd
CPUID Flags: AVX512VL + AVX512F
Description
Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vmovupd
__m256d _mm256_loadu_pd (double const * mem_addr)
Synopsis
__m256d _mm256_loadu_pd (double const * mem_addr)
#include "immintrin.h"
Instruction: vmovupd ymm, m256
CPUID Flags: AVX
Description
Load 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from memory into dst.
mem_addr does not need to be aligned on any particular boundary.
Operation
dst[255:0] := MEM[mem_addr+255:mem_addr]
dst[MAX:256] := 0
vmovupd
__m256d _mm256_mask_loadu_pd (__m256d src, __mmask8 k, void const* mem_addr)
Synopsis
__m256d _mm256_mask_loadu_pd (__m256d src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovupd
CPUID Flags: AVX512VL + AVX512F
Description
Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vmovupd
__m256d _mm256_maskz_loadu_pd (__mmask8 k, void const* mem_addr)
Synopsis
__m256d _mm256_maskz_loadu_pd (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovupd
CPUID Flags: AVX512VL + AVX512F
Description
Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vmovupd
__m512d _mm512_loadu_pd (void const* mem_addr)
Synopsis
__m512d _mm512_loadu_pd (void const* mem_addr)
#include "immintrin.h"
Instruction: vmovupd zmm {k}, m512
CPUID Flags: AVX512F
Description
Load 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from memory into dst.
mem_addr does not need to be aligned on any particular boundary.
Operation
dst[511:0] := MEM[mem_addr+511:mem_addr]
dst[MAX:512] := 0
vmovupd
__m512d _mm512_mask_loadu_pd (__m512d src, __mmask8 k, void const* mem_addr)
Synopsis
__m512d _mm512_mask_loadu_pd (__m512d src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovupd zmm {k}, m512
CPUID Flags: AVX512F
Description
Load packed double-precision (64-bit) floating-point elements from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vmovupd
__m512d _mm512_maskz_loadu_pd (__mmask8 k, void const* mem_addr)
Synopsis
__m512d _mm512_maskz_loadu_pd (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovupd zmm {k}, m512
CPUID Flags: AVX512F
Description
Load packed double-precision (64-bit) floating-point elements from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
movups
__m128 _mm_loadu_ps (float const* mem_addr)
Synopsis
__m128 _mm_loadu_ps (float const* mem_addr)
#include "xmmintrin.h"
Instruction: movups xmm, m128
CPUID Flags: SSE
Description
Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from memory into dst.
mem_addr does not need to be aligned on any particular boundary.
Operation
dst[127:0] := MEM[mem_addr+127:mem_addr]
vmovups
__m128 _mm_mask_loadu_ps (__m128 src, __mmask8 k, void const* mem_addr)
Synopsis
__m128 _mm_mask_loadu_ps (__m128 src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovups
CPUID Flags: AVX512VL + AVX512F
Description
Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vmovups
__m128 _mm_maskz_loadu_ps (__mmask8 k, void const* mem_addr)
Synopsis
__m128 _mm_maskz_loadu_ps (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovups
CPUID Flags: AVX512VL + AVX512F
Description
Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vmovups
__m256 _mm256_loadu_ps (float const * mem_addr)
Synopsis
__m256 _mm256_loadu_ps (float const * mem_addr)
#include "immintrin.h"
Instruction: vmovups ymm, m256
CPUID Flags: AVX
Description
Load 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from memory into dst.
mem_addr does not need to be aligned on any particular boundary.
Operation
dst[255:0] := MEM[mem_addr+255:mem_addr]
dst[MAX:256] := 0
vmovups
__m256 _mm256_mask_loadu_ps (__m256 src, __mmask8 k, void const* mem_addr)
Synopsis
__m256 _mm256_mask_loadu_ps (__m256 src, __mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovups
CPUID Flags: AVX512VL + AVX512F
Description
Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vmovups
__m256 _mm256_maskz_loadu_ps (__mmask8 k, void const* mem_addr)
Synopsis
__m256 _mm256_maskz_loadu_ps (__mmask8 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovups
CPUID Flags: AVX512VL + AVX512F
Description
Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vmovups
__m512 _mm512_loadu_ps (void const* mem_addr)
Synopsis
__m512 _mm512_loadu_ps (void const* mem_addr)
#include "immintrin.h"
Instruction: vmovups zmm {k}, m512
CPUID Flags: AVX512F
Description
Load 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from memory into dst.
mem_addr does not need to be aligned on any particular boundary.
Operation
dst[511:0] := MEM[mem_addr+511:mem_addr]
dst[MAX:512] := 0
vmovups
__m512 _mm512_mask_loadu_ps (__m512 src, __mmask16 k, void const* mem_addr)
Synopsis
__m512 _mm512_mask_loadu_ps (__m512 src, __mmask16 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovups zmm {k}, m512
CPUID Flags: AVX512F
Description
Load packed single-precision (32-bit) floating-point elements from memory into dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vmovups
__m512 _mm512_maskz_loadu_ps (__mmask16 k, void const* mem_addr)
Synopsis
__m512 _mm512_maskz_loadu_ps (__mmask16 k, void const* mem_addr)
#include "immintrin.h"
Instruction: vmovups zmm {k}, m512
CPUID Flags: AVX512F
Description
Load packed single-precision (32-bit) floating-point elements from memory into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
movdqu
__m128i _mm_loadu_si128 (__m128i const* mem_addr)
Synopsis
__m128i _mm_loadu_si128 (__m128i const* mem_addr)
#include "emmintrin.h"
Instruction: movdqu xmm, m128
CPUID Flags: SSE2
Description
Load 128-bits of integer data from memory into dst.
mem_addr does not need to be aligned on any particular boundary.
Operation
dst[127:0] := MEM[mem_addr+127:mem_addr]
movzwl+movd
__m128i _mm_loadu_si16 (void const* mem_addr)
Synopsis
__m128i _mm_loadu_si16 (void const* mem_addr)
#include "immintrin.h"
Instruction: movzwl+movd
Description
Load unaligned 16-bit integer from memory into the first element of dst.
Operation
dst[15:0] := MEM[mem_addr+15:mem_addr]
dst[MAX:16] := 0
...
__m128i _mm_loadu_si16 (void const* mem_addr)
Synopsis
__m128i _mm_loadu_si16 (void const* mem_addr)
#include "immintrin.h"
CPUID Flags: SSE2
Description
Load unaligned 16-bit integer from memory into the first element of dst.
Operation
dst[15:0] := MEM[mem_addr+15:mem_addr]
dst[MAX:16] := 0
vmovdqu
__m256i _mm256_loadu_si256 (__m256i const * mem_addr)
Synopsis
__m256i _mm256_loadu_si256 (__m256i const * mem_addr)
#include "immintrin.h"
Instruction: vmovdqu ymm, m256
CPUID Flags: AVX
Description
Load 256-bits of integer data from memory into dst.
mem_addr does not need to be aligned on any particular boundary.
Operation
dst[255:0] := MEM[mem_addr+255:mem_addr]
dst[MAX:256] := 0
movd
__m128i _mm_loadu_si32 (void const* mem_addr)
Synopsis
__m128i _mm_loadu_si32 (void const* mem_addr)
#include "immintrin.h"
Instruction: movd xmm, m32
CPUID Flags: SSE2
Description
Load unaligned 32-bit integer from memory into the first element of dst.
Operation
dst[31:0] := MEM[mem_addr+31:mem_addr]
dst[MAX:32] := 0
movd
__m128i _mm_loadu_si32 (void const* mem_addr)
Synopsis
__m128i _mm_loadu_si32 (void const* mem_addr)
#include "immintrin.h"
Instruction: movd
Description
Load unaligned 32-bit integer from memory into the first element of dst.
Operation
dst[31:0] := MEM[mem_addr+31:mem_addr]
dst[MAX:32] := 0
vmovdqu32
__m512i _mm512_loadu_si512 (void const* mem_addr)
Synopsis
__m512i _mm512_loadu_si512 (void const* mem_addr)
#include "immintrin.h"
Instruction: vmovdqu32 zmm {k}, m512
CPUID Flags: AVX512F
Description
Load 512-bits of integer data from memory into dst.
mem_addr does not need to be aligned on any particular boundary.
Operation
dst[511:0] := MEM[mem_addr+511:mem_addr]
dst[MAX:512] := 0
movq
__m128i _mm_loadu_si64 (void const* mem_addr)
Synopsis
__m128i _mm_loadu_si64 (void const* mem_addr)
#include "immintrin.h"
Instruction: movq xmm, m64
CPUID Flags: SSE2
Description
Load unaligned 64-bit integer from memory into the first element of dst.
Operation
dst[63:0] := MEM[mem_addr+63:mem_addr]
dst[MAX:64] := 0
movq
__m128i _mm_loadu_si64 (void const* mem_addr)
Synopsis
__m128i _mm_loadu_si64 (void const* mem_addr)
#include "immintrin.h"
Instruction: movq
Description
Load unaligned 64-bit integer from memory into the first element of dst.
Operation
dst[63:0] := MEM[mem_addr+63:mem_addr]
dst[MAX:64] := 0
...
__m256 _mm256_loadu2_m128 (float const* hiaddr, float const* loaddr)
Synopsis
__m256 _mm256_loadu2_m128 (float const* hiaddr, float const* loaddr)
#include "immintrin.h"
CPUID Flags: AVX
Description
Load two 128-bit values (composed of 4 packed single-precision (32-bit) floating-point elements) from memory, and combine them into a 256-bit value in dst.
hiaddr and loaddr do not need to be aligned on any particular boundary.
Operation
dst[127:0] := MEM[loaddr+127:loaddr]
dst[255:128] := MEM[hiaddr+127:hiaddr]
dst[MAX:256] := 0
...
__m256d _mm256_loadu2_m128d (double const* hiaddr, double const* loaddr)
Synopsis
__m256d _mm256_loadu2_m128d (double const* hiaddr, double const* loaddr)
#include "immintrin.h"
CPUID Flags: AVX
Description
Load two 128-bit values (composed of 2 packed double-precision (64-bit) floating-point elements) from memory, and combine them into a 256-bit value in dst.
hiaddr and loaddr do not need to be aligned on any particular boundary.
Operation
dst[127:0] := MEM[loaddr+127:loaddr]
dst[255:128] := MEM[hiaddr+127:hiaddr]
dst[MAX:256] := 0
...
__m256i _mm256_loadu2_m128i (__m128i const* hiaddr, __m128i const* loaddr)
Synopsis
__m256i _mm256_loadu2_m128i (__m128i const* hiaddr, __m128i const* loaddr)
#include "immintrin.h"
CPUID Flags: AVX
Description
Load two 128-bit values (composed of integer data) from memory, and combine them into a 256-bit value in dst.
hiaddr and loaddr do not need to be aligned on any particular boundary.
Operation
dst[127:0] := MEM[loaddr+127:loaddr]
dst[255:128] := MEM[hiaddr+127:hiaddr]
dst[MAX:256] := 0
vloadunpackhd
__m512i _mm512_loadunpackhi_epi32 (__m512i src, void const* mt)
Synopsis
__m512i _mm512_loadunpackhi_epi32 (__m512i src, void const* mt)
#include "immintrin.h"
Instruction: vloadunpackhd zmm {k}, m512
CPUID Flags: KNCNI
Description
Loads the high-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt-64 and expands them into packed 32-bit integers in dst. The initial values of dst are copied from src. Only those converted doublewords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those doublewords are taken from src.
Operation
dst[511:0] := src[511:0]
loadOffset := 0
foundNext64BytesBoundary := false
addr = mt-64
FOR j := 0 to 15
IF foundNext64BytesBoundary == false
IF ((addr + (loadOffset + 1)*4) % 64) == 0
foundNext64BytesBoundary := true
FI
ELSE
i := j*32
tmp := MEM[addr + loadOffset*4]
dst[i+31:i] := tmp[i+31:i]
FI
loadOffset := loadOffset + 1
ENDFOR
dst[MAX:512] := 0
vloadunpackhd
__m512i _mm512_mask_loadunpackhi_epi32 (__m512i src, __mmask16 k, void const * mt)
Synopsis
__m512i _mm512_mask_loadunpackhi_epi32 (__m512i src, __mmask16 k, void const * mt)
#include "immintrin.h"
Instruction: vloadunpackhd zmm {k}, m512
CPUID Flags: KNCNI
Description
Loads the high-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt-64 and expands them into packed 32-bit integers in dst. The initial values of dst are copied from src. Only those converted doublewords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those doublewords are taken from src. Elements are loaded from memory according to element selector k (elements are skipped when the corresponding mask bit is not set).
Operation
dst[511:0] := src[511:0]
loadOffset := 0
foundNext64BytesBoundary := false
addr = mt-64
FOR j := 0 to 15
IF k[j]
IF foundNext64BytesBoundary == false
IF ((addr + (loadOffset + 1)*4) % 64) == 0
foundNext64BytesBoundary := true
FI
ELSE
i := j*32
tmp := MEM[addr + loadOffset*4]
dst[i+31:i] := tmp[i+31:i]
FI
loadOffset := loadOffset + 1
FI
ENDFOR
dst[MAX:512] := 0
vloadunpackhq
__m512i _mm512_loadunpackhi_epi64 (__m512i src, void const* mt)
Synopsis
__m512i _mm512_loadunpackhi_epi64 (__m512i src, void const* mt)
#include "immintrin.h"
Instruction: vloadunpackhq zmm {k}, m512
CPUID Flags: KNCNI
Description
Loads the high-64-byte-aligned portion of the quadword stream starting at element-aligned address mt-64 and expands them into packed 64-bit integers in dst. The initial values of dst are copied from src. Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from src.
Operation
dst[511:0] := src[511:0]
loadOffset := 0
foundNext64BytesBoundary := false
addr = mt-64
FOR j := 0 to 7
IF foundNext64BytesBoundary == false
IF ((addr + (loadOffset + 1)*8) % 64) == 0
foundNext64BytesBoundary := true
FI
ELSE
i := j*64
tmp := MEM[addr + loadOffset*8]
dst[i+63:i] := tmp[i+63:i]
FI
loadOffset := loadOffset + 1
ENDFOR
dst[MAX:512] := 0
vloadunpackhq
__m512i _mm512_mask_loadunpackhi_epi64 (__m512i src, __mmask8 k, void const* mt)
Synopsis
__m512i _mm512_mask_loadunpackhi_epi64 (__m512i src, __mmask8 k, void const* mt)
#include "immintrin.h"
Instruction: vloadunpackhq zmm {k}, m512
CPUID Flags: KNCNI
Description
Loads the high-64-byte-aligned portion of the quadword stream starting at element-aligned address mt-64 and expands them into packed 64-bit integers in dst. The initial values of dst are copied from src. Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from src. Elements are loaded from memory according to element selector k (elements are skipped when the corresponding mask bit is not set).
Operation
dst[511:0] := src[511:0]
loadOffset := 0
foundNext64BytesBoundary := false
addr = mt-64
FOR j := 0 to 7
IF k[j]
IF foundNext64BytesBoundary == false
IF ((addr + (loadOffset + 1)*8) % 64) == 0
foundNext64BytesBoundary := true
FI
ELSE
i := j*64
tmp := MEM[addr + loadOffset*8]
dst[i+63:i] := tmp[i+63:i]
FI
loadOffset := loadOffset + 1
FI
ENDFOR
dst[MAX:512] := 0
vloadunpackhpd
__m512d _mm512_loadunpackhi_pd (__m512d src, void const* mt)
Synopsis
__m512d _mm512_loadunpackhi_pd (__m512d src, void const* mt)
#include "immintrin.h"
Instruction: vloadunpackhpd zmm {k}, m512
CPUID Flags: KNCNI
Description
Loads the high-64-byte-aligned portion of the quadword stream starting at element-aligned address mt-64 and expands them into packed double-precision (64-bit) floating-point values in dst. The initial values of dst are copied from src. Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from src.
Operation
dst[511:0] := src[511:0]
loadOffset := 0
foundNext64BytesBoundary := false
addr = mt-64
FOR j := 0 to 7
IF foundNext64BytesBoundary == false
IF (addr + (loadOffset + 1)*8) % 64 == 0
foundNext64BytesBoundary := true
FI
ELSE
i := j*64
tmp := MEM[addr + loadOffset*8]
dst[i+63:i] := tmp[i+63:i]
FI
loadOffset := loadOffset + 1
ENDFOR
dst[MAX:512] := 0
vloadunpackhpd
__m512d _mm512_mask_loadunpackhi_pd (__m512d src, __mmask8 k, void const* mt)
Synopsis
__m512d _mm512_mask_loadunpackhi_pd (__m512d src, __mmask8 k, void const* mt)
#include "immintrin.h"
Instruction: vloadunpackhpd zmm {k}, m512
CPUID Flags: KNCNI
Description
Loads the high-64-byte-aligned portion of the quadword stream starting at element-aligned address mt-64 and expands them into packed double-precision (64-bit) floating-point values in dst. The initial values of dst are copied from src. Only those converted quadwords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those quadwords are taken from src. Elements are loaded from memory according to element selector k (elements are skipped when the corresponding mask bit is not set).
Operation
dst[511:0] := src[511:0]
loadOffset := 0
foundNext64BytesBoundary := false
addr = mt-64
FOR j := 0 to 7
IF k[j]
IF foundNext64BytesBoundary == false
IF (addr + (loadOffset + 1)*8) % 64 == 0
foundNext64BytesBoundary := true
FI
ELSE
i := j*64
tmp := MEM[addr + loadOffset*8]
dst[i+63:i] := tmp[i+63:i]
FI
loadOffset := loadOffset + 1
FI
ENDFOR
dst[MAX:512] := 0
vloadunpackhps
__m512 _mm512_loadunpackhi_ps (__m512 src, void const* mt)
Synopsis
__m512 _mm512_loadunpackhi_ps (__m512 src, void const* mt)
#include "immintrin.h"
Instruction: vloadunpackhps zmm {k}, m512
CPUID Flags: KNCNI
Description
Loads the high-64-byte-aligned portion of the doubleword stream starting at element-aligned address mt-64 and expands them into packed single-precision (32-bit) floating-point elements in dst. The initial values of dst are copied from src. Only those converted doublewords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those doublewords are taken from src.
Operation
dst[511:0] := src[511:0]
loadOffset := 0
foundNext64BytesBoundary := false
addr = mt-64
FOR j := 0 to 15
IF foundNext64BytesBoundary == false
IF ((addr + (loadOffset + 1)*4) % 64) == 0
foundNext64BytesBoundary := true
FI
ELSE
i := j*32
tmp := MEM[addr + loadOffset*4]
dst[i+31:i] := tmp[i+31:i]
FI
loadOffset := loadOffset + 1
ENDFOR
dst[MAX:512] := 0
vloadunpackhps
__m512 _mm512_mask_loadunpackhi_ps (__m512 src, __mmask16 k, void const* mt)
Synopsis
__m512 _mm512_mask_loadunpackhi_ps (__m512 src, __mmask16 k, void const* mt)
#include "immintrin.h"
Instruction: vloadunpackhps zmm {k}, m512
CPUID Flags: KNCNI
Description
Loads the high-64-byte-aligned portion of the doubleword stream starting at element-aligned address mt-64 and expands them into packed single-precision (32-bit) floating-point elements in dst. The initial values of dst are copied from src. Only those converted doublewords that occur at or after the first 64-byte-aligned address following (mt-64) are loaded. Elements in the resulting vector that do not map to those doublewords are taken from src. Elements are loaded from memory according to element selector k (elements are skipped when the corresponding mask bit is not set).
Operation
dst[511:0] := src[511:0]
loadOffset := 0
foundNext64BytesBoundary := false
addr = mt-64
FOR j := 0 to 15
IF k[j]
IF foundNext64BytesBoundary == false
IF ((addr + (loadOffset + 1)*4) % 64) == 0
foundNext64BytesBoundary := true
FI
ELSE
i := j*32
tmp := MEM[addr + loadOffset*4]
dst[i+31:i] := tmp[i+31:i]
FI
loadOffset := loadOffset + 1
FI
ENDFOR
dst[MAX:512] := 0
vloadunpackld
__m512i _mm512_loadunpacklo_epi32 (__m512i src, void const* mt)
Synopsis
__m512i _mm512_loadunpacklo_epi32 (__m512i src, void const* mt)
#include "immintrin.h"
Instruction: vloadunpackld zmm {k}, m512
CPUID Flags: KNCNI
Description
Loads the low-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt and expands them into packed 32-bit integers in dst. The initial values of dst are copied from src. Only those converted doublewords that occur before first 64-byte-aligned address following mt are loaded. Elements in the resulting vector that do not map to those doublewords are taken from src.
Operation
dst[511:0] := src[511:0]
loadOffset := 0
addr = mt
FOR j := 0 to 15
i := j*32
tmp := MEM[addr + loadOffset*4]
dst[i+31:i] := tmp[i+31:i]
loadOffset := loadOffset + 1
IF (mt + loadOffset * 4) % 64 == 0
break
FI
ENDFOR
dst[MAX:512] := 0
vloadunpackld
__m512i _mm512_mask_loadunpacklo_epi32 (__m512i src, __mmask16 k, void const* mt)
Synopsis
__m512i _mm512_mask_loadunpacklo_epi32 (__m512i src, __mmask16 k, void const* mt)
#include "immintrin.h"
Instruction: vloadunpackld zmm {k}, m512
CPUID Flags: KNCNI
Description
Loads the low-64-byte-aligned portion of the byte/word/doubleword stream starting at element-aligned address mt and expands them into packed 32-bit integers in dst. The initial values of dst are copied from src. Only those converted doublewords that occur before first 64-byte-aligned address following mt are loaded. Elements in the resulting vector that do not map to those doublewords are taken from src. Elements are loaded from memory according to element selector k (elements are skipped when the corresponding mask bit is not set).
Operation
dst[511:0] := src[511:0]
loadOffset := 0
addr = mt
FOR j := 0 to 15
i := j*32
IF k[j]
tmp := MEM[addr + loadOffset*4]
dst[i+31:i] := tmp[i+31:i]
loadOffset := loadOffset + 1
IF (mt + loadOffset * 4) % 64 == 0
break
FI
FI
ENDFOR
dst[MAX:512] := 0
vloadunpacklq
__m512i _mm512_loadunpacklo_epi64 (__m512i src, void const* mt)
Synopsis
__m512i _mm512_loadunpacklo_epi64 (__m512i src, void const* mt)
#include "immintrin.h"
Instruction: vloadunpacklq zmm {k}, m512
CPUID Flags: KNCNI
Description
Loads the low-64-byte-aligned portion of the quadword stream starting at element-aligned address mt and expands them into packed 64-bit integers in dst. The initial values of dst are copied from src. Only those converted quadwords that occur before first 64-byte-aligned address following mt are loaded. Elements in the resulting vector that do not map to those quadwords are taken from src.
Operation
dst[511:0] := src[511:0]
loadOffset := 0
addr = mt
FOR j := 0 to 7
i := j*64
tmp := MEM[addr + loadOffset*8]
dst[i+63:i] := tmp[i+63:i]
loadOffset := loadOffset + 1
IF ((addr + loadOffset*8) % 64) == 0
BREAK
FI
ENDFOR
dst[MAX:512] := 0
vloadunpacklq
__m512i _mm512_mask_loadunpacklo_epi64 (__m512i src, __mmask8 k, void const* mt)
Synopsis
__m512i _mm512_mask_loadunpacklo_epi64 (__m512i src, __mmask8 k, void const* mt)
#include "immintrin.h"
Instruction: vloadunpacklq zmm {k}, m512
CPUID Flags: KNCNI
Description
Loads the low-64-byte-aligned portion of the quadword stream starting at element-aligned address mt and expands them into packed 64-bit integers in dst. The initial values of dst are copied from src. Only those converted quadwords that occur before first 64-byte-aligned address following mt are loaded. Elements in the resulting vector that do not map to those quadwords are taken from src. Elements are loaded from memory according to element selector k (elements are skipped when the corresponding mask bit is not set).
Operation
dst[511:0] := src[511:0]
loadOffset := 0
addr = mt
FOR j := 0 to 7
i := j*64
IF k[j]
tmp := MEM[addr + loadOffset*8]
dst[i+63:i] := tmp[i+63:i]
loadOffset := loadOffset + 1
IF ((addr + loadOffset*8) % 64) == 0
BREAK
FI
FI
ENDFOR
dst[MAX:512] := 0
vloadunpacklpd
__m512d _mm512_loadunpacklo_pd (__m512d src, void const* mt)
Synopsis
__m512d _mm512_loadunpacklo_pd (__m512d src, void const* mt)
#include "immintrin.h"
Instruction: vloadunpacklpd zmm {k}, m512
CPUID Flags: KNCNI
Description
Loads the low-64-byte-aligned portion of the quadword stream starting at element-aligned address mt and expands them into packed double-precision (64-bit) floating-point elements in dst. The initial values of dst are copied from src. Only those converted quadwords that occur before first 64-byte-aligned address following mt are loaded. Elements in the resulting vector that do not map to those quadwords are taken from src.
Operation
dst[511:0] := src[511:0]
loadOffset := 0
addr = mt
FOR j := 0 to 7
i := j*64
tmp := MEM[addr + loadOffset*8]
dst[i+63:i] := tmp[i+63:i]
loadOffset := loadOffset + 1
IF ((addr + 8*loadOffset) % 64) == 0
BREAK
FI
ENDFOR
dst[MAX:512] := 0
vloadunpacklpd
__m512d _mm512_mask_loadunpacklo_pd (__m512d src, __mmask8 k, void const* mt)
Synopsis
__m512d _mm512_mask_loadunpacklo_pd (__m512d src, __mmask8 k, void const* mt)
#include "immintrin.h"
Instruction: vloadunpacklpd zmm {k}, m512
CPUID Flags: KNCNI
Description
Loads the low-64-byte-aligned portion of the quadword stream starting at element-aligned address mt and expands them into packed double-precision (64-bit) floating-point values in dst. The initial values of dst are copied from src. Only those converted quadwords that occur before first 64-byte-aligned address following mt are loaded. Elements in the resulting vector that do not map to those quadwords are taken from src. Elements are loaded from memory according to element selector k (elements are skipped when the corresponding mask bit is not set).
Operation
dst[511:0] := src[511:0]
loadOffset := 0
addr = mt
FOR j := 0 to 7
i := j*64
IF k[j]
tmp := MEM[addr + loadOffset*8]
dst[i+63:i] := tmp[i+63:i]
loadOffset := loadOffset + 1
IF ((addr + 8*loadOffset) % 64) == 0
BREAK
FI
FI
ENDFOR
dst[MAX:512] := 0
vloadunpacklps
__m512 _mm512_loadunpacklo_ps (__m512 src, void const* mt)
Synopsis
__m512 _mm512_loadunpacklo_ps (__m512 src, void const* mt)
#include "immintrin.h"
Instruction: vloadunpacklps zmm {k}, m512
CPUID Flags: KNCNI
Description
Loads the low-64-byte-aligned portion of the doubleword stream starting at element-aligned address mt and expands them into packed single-precision (32-bit) floating-point elements in dst. The initial values of dst are copied from src. Only those converted doublewords that occur before first 64-byte-aligned address following mt are loaded. Elements in the resulting vector that do not map to those doublewords are taken from src.
Operation
dst[511:0] := src[511:0]
loadOffset := 0
addr = mt
FOR j := 0 to 15
i := j*32
tmp := MEM[addr + loadOffset*4]
dst[i+31:i] := tmp[i+31:i]
loadOffset := loadOffset + 1
IF (mt + loadOffset * 4) % 64 == 0
BREAK
FI
ENDFOR
dst[MAX:512] := 0
vloadunpacklps
__m512 _mm512_mask_loadunpacklo_ps (__m512 src, __mmask16 k, void const* mt)
Synopsis
__m512 _mm512_mask_loadunpacklo_ps (__m512 src, __mmask16 k, void const* mt)
#include "immintrin.h"
Instruction: vloadunpacklps zmm {k}, m512
CPUID Flags: KNCNI
Description
Loads the low-64-byte-aligned portion of the doubleword stream starting at element-aligned address mt and expands them into packed single-precision (32-bit) floating-point elements in dst. The initial values of dst are copied from src. Only those converted doublewords that occur before first 64-byte-aligned address following mt are loaded. Elements in the resulting vector that do not map to those doublewords are taken from src. Elements are loaded from memory according to element selector k (elements are skipped when the corresponding mask bit is not set).
Operation
dst[511:0] := src[511:0]
loadOffset := 0
addr = mt
FOR j := 0 to 15
i := j*32
IF k[j]
tmp := MEM[addr + loadOffset*4]
dst[i+31:i] := tmp[i+31:i]
loadOffset := loadOffset + 1
IF (mt + loadOffset * 4) % 64 == 0
break
FI
FI
ENDFOR
dst[MAX:512] := 0
...
__m128d _mm_log_pd (__m128d a)
Synopsis
__m128d _mm_log_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the natural logarithm of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := ln(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
...
__m256d _mm256_log_pd (__m256d a)
Synopsis
__m256d _mm256_log_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the natural logarithm of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := ln(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
...
__m512d _mm512_log_pd (__m512d a)
Synopsis
__m512d _mm512_log_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the natural logarithm of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := ln(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
...
__m512d _mm512_mask_log_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_log_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the natural logarithm of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := ln(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128 _mm_log_ps (__m128 a)
Synopsis
__m128 _mm_log_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the natural logarithm of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := ln(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256 _mm256_log_ps (__m256 a)
Synopsis
__m256 _mm256_log_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the natural logarithm of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := ln(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
...
__m512 _mm512_log_ps (__m512 a)
Synopsis
__m512 _mm512_log_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the natural logarithm of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := ln(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
...
__m512 _mm512_mask_log_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_log_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the natural logarithm of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := ln(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128d _mm_log10_pd (__m128d a)
Synopsis
__m128d _mm_log10_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the base-10 logarithm of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := log10(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
...
__m256d _mm256_log10_pd (__m256d a)
Synopsis
__m256d _mm256_log10_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the base-10 logarithm of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := log10(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
...
__m512d _mm512_log10_pd (__m512d a)
Synopsis
__m512d _mm512_log10_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the base-10 logarithm of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := log10(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
...
__m512d _mm512_mask_log10_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_log10_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the base-10 logarithm of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := log10(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128 _mm_log10_ps (__m128 a)
Synopsis
__m128 _mm_log10_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the base-10 logarithm of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := log10(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256 _mm256_log10_ps (__m256 a)
Synopsis
__m256 _mm256_log10_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the base-10 logarithm of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := log10(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
...
__m512 _mm512_log10_ps (__m512 a)
Synopsis
__m512 _mm512_log10_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the base-10 logarithm of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := log10(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
...
__m512 _mm512_mask_log10_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_log10_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the base-10 logarithm of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := log10(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128d _mm_log1p_pd (__m128d a)
Synopsis
__m128d _mm_log1p_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the natural logarithm of one plus packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := ln(1.0 + a[i+63:i])
ENDFOR
dst[MAX:128] := 0
...
__m256d _mm256_log1p_pd (__m256d a)
Synopsis
__m256d _mm256_log1p_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the natural logarithm of one plus packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := ln(1.0 + a[i+63:i])
ENDFOR
dst[MAX:256] := 0
...
__m512d _mm512_log1p_pd (__m512d a)
Synopsis
__m512d _mm512_log1p_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the natural logarithm of one plus packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := ln(1.0 + a[i+63:i])
ENDFOR
dst[MAX:512] := 0
...
__m512d _mm512_mask_log1p_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_log1p_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the natural logarithm of one plus packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := ln(1.0 + a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128 _mm_log1p_ps (__m128 a)
Synopsis
__m128 _mm_log1p_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the natural logarithm of one plus packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := ln(1.0 + a[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256 _mm256_log1p_ps (__m256 a)
Synopsis
__m256 _mm256_log1p_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the natural logarithm of one plus packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := ln(1.0 + a[i+31:i])
ENDFOR
dst[MAX:256] := 0
...
__m512 _mm512_log1p_ps (__m512 a)
Synopsis
__m512 _mm512_log1p_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the natural logarithm of one plus packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := ln(1.0 + a[i+31:i])
ENDFOR
dst[MAX:512] := 0
...
__m512 _mm512_mask_log1p_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_log1p_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the natural logarithm of one plus packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := ln(1.0 + a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128d _mm_log2_pd (__m128d a)
Synopsis
__m128d _mm_log2_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the base-2 logarithm of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := log2(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
...
__m256d _mm256_log2_pd (__m256d a)
Synopsis
__m256d _mm256_log2_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the base-2 logarithm of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := log2(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
...
__m512d _mm512_log2_pd (__m512d a)
Synopsis
__m512d _mm512_log2_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the base-2 logarithm of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := log2(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
...
__m512d _mm512_mask_log2_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_log2_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the base-2 logarithm of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := log2(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128 _mm_log2_ps (__m128 a)
Synopsis
__m128 _mm_log2_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := log2(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256 _mm256_log2_ps (__m256 a)
Synopsis
__m256 _mm256_log2_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := log2(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
vlog2ps
__m512 _mm512_log2_ps (__m512 a)
Synopsis
__m512 _mm512_log2_ps (__m512 a)
#include "immintrin.h"
Instruction: vlog2ps zmm {k}, zmm
CPUID Flags: KNCNI
Description
Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := log2(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
vlog2ps
__m512 _mm512_mask_log2_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_log2_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vlog2ps zmm {k}, zmm
CPUID Flags: KNCNI
Description
Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := log2(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vlog2ps
__m512 _mm512_log2ae23_ps (__m512 a)
Synopsis
__m512 _mm512_log2ae23_ps (__m512 a)
#include "immintrin.h"
Instruction: vlog2ps zmm {k}, m512
CPUID Flags: KNCNI
Description
Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in a with absolute error of 2^(-23) and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := Log2ae23(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
vlog2ps
__m512 _mm512_mask_log2ae23_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_log2ae23_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vlog2ps zmm {k}, m512
CPUID Flags: KNCNI
Description
Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in a with absolute error of 2^(-23) and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := Log2ae23(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128d _mm_logb_pd (__m128d a)
Synopsis
__m128d _mm_logb_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := ConvertExpFP64(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
...
__m256d _mm256_logb_pd (__m256d a)
Synopsis
__m256d _mm256_logb_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := ConvertExpFP64(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
...
__m512d _mm512_logb_pd (__m512d a)
Synopsis
__m512d _mm512_logb_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := ConvertExpFP64(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
...
__m512d _mm512_mask_logb_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_logb_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Convert the exponent of each packed double-precision (64-bit) floating-point element in a to a double-precision floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := ConvertExpFP64(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m128 _mm_logb_ps (__m128 a)
Synopsis
__m128 _mm_logb_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := ConvertExpFP32(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256 _mm256_logb_ps (__m256 a)
Synopsis
__m256 _mm256_logb_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := ConvertExpFP32(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
...
__m512 _mm512_logb_ps (__m512 a)
Synopsis
__m512 _mm512_logb_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision floating-point number representing the integer exponent, and store the results in dst. This intrinsic essentially calculates floor(log2(x)) for each element.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := ConvertExpFP32(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
...
__m512 _mm512_mask_logb_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_logb_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Convert the exponent of each packed single-precision (32-bit) floating-point element in a to a single-precision floating-point number representing the integer exponent, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates floor(log2(x)) for each element.
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := ConvertExpFP32(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
rol
unsigned long _lrotl (unsigned long a, int shift)
Synopsis
unsigned long _lrotl (unsigned long a, int shift)
#include "immintrin.h"
Instruction: rol r64, imm
Description
Shift the bits of unsigned 64-bit integer a left by the number of bits specified in shift, rotating the most-significant bit to the least-significant bit location, and store the unsigned result in dst.
Operation
dst := a
count := shift BITWISE AND 63
DO WHILE (count > 0)
tmp[0] := dst[63]
dst := (dst << 1) OR tmp[0]
count := count - 1
OD
Performance
ror
unsigned long _lrotr (unsigned long a, int shift)
Synopsis
unsigned long _lrotr (unsigned long a, int shift)
#include "immintrin.h"
Instruction: ror r64, imm
Description
Shift the bits of unsigned 64-bit integer a right by the number of bits specified in shift, rotating the least-significant bit to the most-significant bit location, and store the unsigned result in dst.
Operation
dst := a
count := shift BITWISE AND 63
DO WHILE (count > 0)
tmp[63] := dst[0]
dst := (dst >> 1) OR tmp[63]
count := count - 1
OD
Performance
vplzcntd
__m128i _mm_lzcnt_epi32 (__m128i a)
Synopsis
__m128i _mm_lzcnt_epi32 (__m128i a)
#include "immintrin.h"
Instruction: vplzcntd
CPUID Flags: AVX512VL + AVX512CD
Description
Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
tmp := 31
dst[i+31:i] := 0
DO WHILE (tmp >= 0 AND a[i+tmp] == 0)
tmp := tmp - 1
dst[i+31:i] := dst[i+31:i] + 1
OD
ENDFOR
dst[MAX:128] := 0
vplzcntd
__m128i _mm_mask_lzcnt_epi32 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_lzcnt_epi32 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vplzcntd
CPUID Flags: AVX512VL + AVX512CD
Description
Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
tmp := 31
dst[i+31:i] := 0
DO WHILE (tmp >= 0 AND a[i+tmp] == 0)
tmp := tmp - 1
dst[i+31:i] := dst[i+31:i] + 1
OD
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vplzcntd
__m128i _mm_maskz_lzcnt_epi32 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_lzcnt_epi32 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vplzcntd
CPUID Flags: AVX512VL + AVX512CD
Description
Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
tmp := 31
dst[i+31:i] := 0
DO WHILE (tmp >= 0 AND a[i+tmp] == 0)
tmp := tmp - 1
dst[i+31:i] := dst[i+31:i] + 1
OD
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vplzcntd
__m256i _mm256_lzcnt_epi32 (__m256i a)
Synopsis
__m256i _mm256_lzcnt_epi32 (__m256i a)
#include "immintrin.h"
Instruction: vplzcntd
CPUID Flags: AVX512VL + AVX512CD
Description
Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
tmp := 31
dst[i+31:i] := 0
DO WHILE (tmp >= 0 AND a[i+tmp] == 0)
tmp := tmp - 1
dst[i+31:i] := dst[i+31:i] + 1
OD
ENDFOR
dst[MAX:256] := 0
vplzcntd
__m256i _mm256_mask_lzcnt_epi32 (__m256i src, __mmask8 k, __m256i a)
Synopsis
__m256i _mm256_mask_lzcnt_epi32 (__m256i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vplzcntd
CPUID Flags: AVX512VL + AVX512CD
Description
Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
tmp := 31
dst[i+31:i] := 0
DO WHILE (tmp >= 0 AND a[i+tmp] == 0)
tmp := tmp - 1
dst[i+31:i] := dst[i+31:i] + 1
OD
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vplzcntd
__m256i _mm256_maskz_lzcnt_epi32 (__mmask8 k, __m256i a)
Synopsis
__m256i _mm256_maskz_lzcnt_epi32 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vplzcntd
CPUID Flags: AVX512VL + AVX512CD
Description
Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
tmp := 31
dst[i+31:i] := 0
DO WHILE (tmp >= 0 AND a[i+tmp] == 0)
tmp := tmp - 1
dst[i+31:i] := dst[i+31:i] + 1
OD
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vplzcntd
__m512i _mm512_lzcnt_epi32 (__m512i a)
Synopsis
__m512i _mm512_lzcnt_epi32 (__m512i a)
#include "immintrin.h"
Instruction: vplzcntd zmm {k}, zmm
CPUID Flags: AVX512CD
Description
Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
tmp := 31
dst[i+31:i] := 0
DO WHILE (tmp >= 0 AND a[i+tmp] == 0)
tmp := tmp - 1
dst[i+31:i] := dst[i+31:i] + 1
OD
ENDFOR
dst[MAX:512] := 0
vplzcntd
__m512i _mm512_mask_lzcnt_epi32 (__m512i src, __mmask16 k, __m512i a)
Synopsis
__m512i _mm512_mask_lzcnt_epi32 (__m512i src, __mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vplzcntd zmm {k}, zmm
CPUID Flags: AVX512CD
Description
Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
tmp := 31
dst[i+31:i] := 0
DO WHILE (tmp >= 0 AND a[i+tmp] == 0)
tmp := tmp - 1
dst[i+31:i] := dst[i+31:i] + 1
OD
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vplzcntd
__m512i _mm512_maskz_lzcnt_epi32 (__mmask16 k, __m512i a)
Synopsis
__m512i _mm512_maskz_lzcnt_epi32 (__mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vplzcntd zmm {k}, zmm
CPUID Flags: AVX512CD
Description
Counts the number of leading zero bits in each packed 32-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
tmp := 31
dst[i+31:i] := 0
DO WHILE (tmp >= 0 AND a[i+tmp] == 0)
tmp := tmp - 1
dst[i+31:i] := dst[i+31:i] + 1
OD
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vplzcntq
__m128i _mm_lzcnt_epi64 (__m128i a)
Synopsis
__m128i _mm_lzcnt_epi64 (__m128i a)
#include "immintrin.h"
Instruction: vplzcntq
CPUID Flags: AVX512VL + AVX512CD
Description
Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
tmp := 63
dst[i+63:i] := 0
DO WHILE (tmp >= 0 AND a[i+tmp] == 0)
tmp := tmp - 1
dst[i+63:i] := dst[i+63:i] + 1
OD
ENDFOR
dst[MAX:128] := 0
vplzcntq
__m128i _mm_mask_lzcnt_epi64 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_lzcnt_epi64 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vplzcntq
CPUID Flags: AVX512VL + AVX512CD
Description
Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
tmp := 63
dst[i+63:i] := 0
DO WHILE (tmp >= 0 AND a[i+tmp] == 0)
tmp := tmp - 1
dst[i+63:i] := dst[i+63:i] + 1
OD
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vplzcntq
__m128i _mm_maskz_lzcnt_epi64 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_lzcnt_epi64 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vplzcntq
CPUID Flags: AVX512VL + AVX512CD
Description
Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
tmp := 63
dst[i+63:i] := 0
DO WHILE (tmp >= 0 AND a[i+tmp] == 0)
tmp := tmp - 1
dst[i+63:i] := dst[i+63:i] + 1
OD
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vplzcntq
__m256i _mm256_lzcnt_epi64 (__m256i a)
Synopsis
__m256i _mm256_lzcnt_epi64 (__m256i a)
#include "immintrin.h"
Instruction: vplzcntq
CPUID Flags: AVX512VL + AVX512CD
Description
Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
tmp := 63
dst[i+63:i] := 0
DO WHILE (tmp >= 0 AND a[i+tmp] == 0)
tmp := tmp - 1
dst[i+63:i] := dst[i+63:i] + 1
OD
ENDFOR
dst[MAX:256] := 0
vplzcntq
__m256i _mm256_mask_lzcnt_epi64 (__m256i src, __mmask8 k, __m256i a)
Synopsis
__m256i _mm256_mask_lzcnt_epi64 (__m256i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vplzcntq
CPUID Flags: AVX512VL + AVX512CD
Description
Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
tmp := 63
dst[i+63:i] := 0
DO WHILE (tmp >= 0 AND a[i+tmp] == 0)
tmp := tmp - 1
dst[i+63:i] := dst[i+63:i] + 1
OD
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vplzcntq
__m256i _mm256_maskz_lzcnt_epi64 (__mmask8 k, __m256i a)
Synopsis
__m256i _mm256_maskz_lzcnt_epi64 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vplzcntq
CPUID Flags: AVX512VL + AVX512CD
Description
Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
tmp := 63
dst[i+63:i] := 0
DO WHILE (tmp >= 0 AND a[i+tmp] == 0)
tmp := tmp - 1
dst[i+63:i] := dst[i+63:i] + 1
OD
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vplzcntq
__m512i _mm512_lzcnt_epi64 (__m512i a)
Synopsis
__m512i _mm512_lzcnt_epi64 (__m512i a)
#include "immintrin.h"
Instruction: vplzcntq zmm {k}, zmm
CPUID Flags: AVX512CD
Description
Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
tmp := 63
dst[i+63:i] := 0
DO WHILE (tmp >= 0 AND a[i+tmp] == 0)
tmp := tmp - 1
dst[i+63:i] := dst[i+63:i] + 1
OD
ENDFOR
dst[MAX:512] := 0
vplzcntq
__m512i _mm512_mask_lzcnt_epi64 (__m512i src, __mmask8 k, __m512i a)
Synopsis
__m512i _mm512_mask_lzcnt_epi64 (__m512i src, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vplzcntq zmm {k}, zmm
CPUID Flags: AVX512CD
Description
Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
tmp := 63
dst[i+63:i] := 0
DO WHILE (tmp >= 0 AND a[i+tmp] == 0)
tmp := tmp - 1
dst[i+63:i] := dst[i+63:i] + 1
OD
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vplzcntq
__m512i _mm512_maskz_lzcnt_epi64 (__mmask8 k, __m512i a)
Synopsis
__m512i _mm512_maskz_lzcnt_epi64 (__mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vplzcntq zmm {k}, zmm
CPUID Flags: AVX512CD
Description
Counts the number of leading zero bits in each packed 64-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
tmp := 63
dst[i+63:i] := 0
DO WHILE (tmp >= 0 AND a[i+tmp] == 0)
tmp := tmp - 1
dst[i+63:i] := dst[i+63:i] + 1
OD
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
lzcnt
unsigned int _lzcnt_u32 (unsigned int a)
Synopsis
unsigned int _lzcnt_u32 (unsigned int a)
#include "immintrin.h"
Instruction: lzcnt r32, r32
CPUID Flags: LZCNT
Description
Count the number of leading zero bits in unsigned 32-bit integer a, and return that count in dst.
Operation
tmp := 31
dst := 0
DO WHILE (tmp >= 0 AND a[tmp] == 0)
tmp := tmp - 1
dst := dst + 1
OD
Performance
lzcnt
unsigned __int64 _lzcnt_u64 (unsigned __int64 a)
Synopsis
unsigned __int64 _lzcnt_u64 (unsigned __int64 a)
#include "immintrin.h"
Instruction: lzcnt r64, r64
CPUID Flags: LZCNT
Description
Count the number of leading zero bits in unsigned 64-bit integer a, and return that count in dst.
Operation
tmp := 63
dst := 0
DO WHILE (tmp >= 0 AND a[tmp] == 0)
tmp := tmp - 1
dst := dst + 1
OD
Performance
pmaddwd
__m128i _mm_madd_epi16 (__m128i a, __m128i b)
Synopsis
__m128i _mm_madd_epi16 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: pmaddwd xmm, xmm
CPUID Flags: SSE2
Description
Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := a[i+31:i+16]*b[i+31:i+16] + a[i+15:i]*b[i+15:i]
ENDFOR
Performance
vpmaddwd
__m128i _mm_mask_madd_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_madd_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmaddwd
CPUID Flags: AVX512VL + AVX512BW
Description
Multiply packed 16-bit integers in a and b, producing intermediate 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i+16]*b[i+31:i+16] + a[i+15:i]*b[i+15:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vpmaddwd
__m128i _mm_maskz_madd_epi16 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_madd_epi16 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmaddwd
CPUID Flags: AVX512VL + AVX512BW
Description
Multiply packed 16-bit integers in a and b, producing intermediate 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i+16]*b[i+31:i+16] + a[i+15:i]*b[i+15:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpmaddwd
__m256i _mm256_madd_epi16 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_madd_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaddwd ymm, ymm, ymm
CPUID Flags: AVX2
Description
Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := a[i+31:i+16]*b[i+31:i+16] + a[i+15:i]*b[i+15:i]
ENDFOR
dst[MAX:256] := 0
Performance
vpmaddwd
__m256i _mm256_mask_madd_epi16 (__m256i src, __mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_madd_epi16 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaddwd
CPUID Flags: AVX512VL + AVX512BW
Description
Multiply packed 16-bit integers in a and b, producing intermediate 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i+16]*b[i+31:i+16] + a[i+15:i]*b[i+15:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vpmaddwd
__m256i _mm256_maskz_madd_epi16 (__mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_madd_epi16 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaddwd
CPUID Flags: AVX512VL + AVX512BW
Description
Multiply packed 16-bit integers in a and b, producing intermediate 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i+16]*b[i+31:i+16] + a[i+15:i]*b[i+15:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpmaddwd
__m512i _mm512_madd_epi16 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_madd_epi16 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaddwd
CPUID Flags: AVX512BW
Description
Multiply packed 16-bit integers in a and b, producing intermediate 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := a[i+31:i+16]*b[i+31:i+16] + a[i+15:i]*b[i+15:i]
ENDFOR
dst[MAX:512] := 0
vpmaddwd
__m512i _mm512_mask_madd_epi16 (__m512i src, __mmask16 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_madd_epi16 (__m512i src, __mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaddwd
CPUID Flags: AVX512BW
Description
Multiply packed 16-bit integers in a and b, producing intermediate 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i+16]*b[i+31:i+16] + a[i+15:i]*b[i+15:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpmaddwd
__m512i _mm512_maskz_madd_epi16 (__mmask16 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_madd_epi16 (__mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaddwd
CPUID Flags: AVX512BW
Description
Multiply packed 16-bit integers in a and b, producing intermediate 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i+16]*b[i+31:i+16] + a[i+15:i]*b[i+15:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpmadd52huq
__m128i _mm_madd52hi_epu64 (__m128i a, __m128i b, __m128i c)
Synopsis
__m128i _mm_madd52hi_epu64 (__m128i a, __m128i b, __m128i c)
#include "immintrin.h"
Instruction: vpmadd52huq
CPUID Flags: AVX512IFMA52 + AVX512VL
Description
Multiply packed unsigned 52-bit integers in each 64-bit element of b and c to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in a, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52])
ENDFOR
dst[MAX:128] := 0
vpmadd52huq
__m128i _mm_mask_madd52hi_epu64 (__m128i a, __mmask8 k, __m128i b, __m128i c)
Synopsis
__m128i _mm_mask_madd52hi_epu64 (__m128i a, __mmask8 k, __m128i b, __m128i c)
#include "immintrin.h"
Instruction: vpmadd52huq
CPUID Flags: AVX512IFMA52 + AVX512VL
Description
Multiply packed unsigned 52-bit integers in each 64-bit element of b and c to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in a, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52])
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpmadd52huq
__m128i _mm_maskz_madd52hi_epu64 (__mmask8 k, __m128i a, __m128i b, __m128i c)
Synopsis
__m128i _mm_maskz_madd52hi_epu64 (__mmask8 k, __m128i a, __m128i b, __m128i c)
#include "immintrin.h"
Instruction: vpmadd52huq
CPUID Flags: AVX512IFMA52 + AVX512VL
Description
Multiply packed unsigned 52-bit integers in each 64-bit element of b and c to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpmadd52huq
__m256i _mm256_madd52hi_epu64 (__m256i a, __m256i b, __m256i c)
Synopsis
__m256i _mm256_madd52hi_epu64 (__m256i a, __m256i b, __m256i c)
#include "immintrin.h"
Instruction: vpmadd52huq
CPUID Flags: AVX512IFMA52 + AVX512VL
Description
Multiply packed unsigned 52-bit integers in each 64-bit element of b and c to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in a, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52])
ENDFOR
dst[MAX:256] := 0
vpmadd52huq
__m256i _mm256_mask_madd52hi_epu64 (__m256i a, __mmask8 k, __m256i b, __m256i c)
Synopsis
__m256i _mm256_mask_madd52hi_epu64 (__m256i a, __mmask8 k, __m256i b, __m256i c)
#include "immintrin.h"
Instruction: vpmadd52huq
CPUID Flags: AVX512IFMA52 + AVX512VL
Description
Multiply packed unsigned 52-bit integers in each 64-bit element of b and c to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in a, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52])
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpmadd52huq
__m256i _mm256_maskz_madd52hi_epu64 (__mmask8 k, __m256i a, __m256i b, __m256i c)
Synopsis
__m256i _mm256_maskz_madd52hi_epu64 (__mmask8 k, __m256i a, __m256i b, __m256i c)
#include "immintrin.h"
Instruction: vpmadd52huq
CPUID Flags: AVX512IFMA52 + AVX512VL
Description
Multiply packed unsigned 52-bit integers in each 64-bit element of b and c to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpmadd52huq
__m512i _mm512_madd52hi_epu64 (__m512i a, __m512i b, __m512i c)
Synopsis
__m512i _mm512_madd52hi_epu64 (__m512i a, __m512i b, __m512i c)
#include "immintrin.h"
Instruction: vpmadd52huq
CPUID Flags: AVX512IFMA52
Description
Multiply packed unsigned 52-bit integers in each 64-bit element of b and c to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in a, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52])
ENDFOR
dst[MAX:512] := 0
vpmadd52huq
__m512i _mm512_mask_madd52hi_epu64 (__m512i a, __mmask8 k, __m512i b, __m512i c)
Synopsis
__m512i _mm512_mask_madd52hi_epu64 (__m512i a, __mmask8 k, __m512i b, __m512i c)
#include "immintrin.h"
Instruction: vpmadd52huq
CPUID Flags: AVX512IFMA52
Description
Multiply packed unsigned 52-bit integers in each 64-bit element of b and c to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in a, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52])
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpmadd52huq
__m512i _mm512_maskz_madd52hi_epu64 (__mmask8 k, __m512i a, __m512i b, __m512i c)
Synopsis
__m512i _mm512_maskz_madd52hi_epu64 (__mmask8 k, __m512i a, __m512i b, __m512i c)
#include "immintrin.h"
Instruction: vpmadd52huq
CPUID Flags: AVX512IFMA52
Description
Multiply packed unsigned 52-bit integers in each 64-bit element of b and c to form a 104-bit intermediate result. Add the high 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[103:52])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpmadd52luq
__m128i _mm_madd52lo_epu64 (__m128i a, __m128i b, __m128i c)
Synopsis
__m128i _mm_madd52lo_epu64 (__m128i a, __m128i b, __m128i c)
#include "immintrin.h"
Instruction: vpmadd52luq
CPUID Flags: AVX512IFMA52 + AVX512VL
Description
Multiply packed unsigned 52-bit integers in each 64-bit element of b and c to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in a, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0])
ENDFOR
dst[MAX:128] := 0
vpmadd52luq
__m128i _mm_mask_madd52lo_epu64 (__m128i a, __mmask8 k, __m128i b, __m128i c)
Synopsis
__m128i _mm_mask_madd52lo_epu64 (__m128i a, __mmask8 k, __m128i b, __m128i c)
#include "immintrin.h"
Instruction: vpmadd52luq
CPUID Flags: AVX512IFMA52 + AVX512VL
Description
Multiply packed unsigned 52-bit integers in each 64-bit element of b and c to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in a, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0])
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpmadd52luq
__m128i _mm_maskz_madd52lo_epu64 (__mmask8 k, __m128i a, __m128i b, __m128i c)
Synopsis
__m128i _mm_maskz_madd52lo_epu64 (__mmask8 k, __m128i a, __m128i b, __m128i c)
#include "immintrin.h"
Instruction: vpmadd52luq
CPUID Flags: AVX512IFMA52 + AVX512VL
Description
Multiply packed unsigned 52-bit integers in each 64-bit element of b and c to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpmadd52luq
__m256i _mm256_madd52lo_epu64 (__m256i a, __m256i b, __m256i c)
Synopsis
__m256i _mm256_madd52lo_epu64 (__m256i a, __m256i b, __m256i c)
#include "immintrin.h"
Instruction: vpmadd52luq
CPUID Flags: AVX512IFMA52 + AVX512VL
Description
Multiply packed unsigned 52-bit integers in each 64-bit element of b and c to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in a, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0])
ENDFOR
dst[MAX:256] := 0
vpmadd52luq
__m256i _mm256_mask_madd52lo_epu64 (__m256i a, __mmask8 k, __m256i b, __m256i c)
Synopsis
__m256i _mm256_mask_madd52lo_epu64 (__m256i a, __mmask8 k, __m256i b, __m256i c)
#include "immintrin.h"
Instruction: vpmadd52luq
CPUID Flags: AVX512IFMA52 + AVX512VL
Description
Multiply packed unsigned 52-bit integers in each 64-bit element of b and c to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in a, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0])
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpmadd52luq
__m256i _mm256_maskz_madd52lo_epu64 (__mmask8 k, __m256i a, __m256i b, __m256i c)
Synopsis
__m256i _mm256_maskz_madd52lo_epu64 (__mmask8 k, __m256i a, __m256i b, __m256i c)
#include "immintrin.h"
Instruction: vpmadd52luq
CPUID Flags: AVX512IFMA52 + AVX512VL
Description
Multiply packed unsigned 52-bit integers in each 64-bit element of b and c to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpmadd52luq
__m512i _mm512_madd52lo_epu64 (__m512i a, __m512i b, __m512i c)
Synopsis
__m512i _mm512_madd52lo_epu64 (__m512i a, __m512i b, __m512i c)
#include "immintrin.h"
Instruction: vpmadd52luq
CPUID Flags: AVX512IFMA52
Description
Multiply packed unsigned 52-bit integers in each 64-bit element of b and c to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in a, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0])
ENDFOR
dst[MAX:512] := 0
vpmadd52luq
__m512i _mm512_mask_madd52lo_epu64 (__m512i a, __mmask8 k, __m512i b, __m512i c)
Synopsis
__m512i _mm512_mask_madd52lo_epu64 (__m512i a, __mmask8 k, __m512i b, __m512i c)
#include "immintrin.h"
Instruction: vpmadd52luq
CPUID Flags: AVX512IFMA52
Description
Multiply packed unsigned 52-bit integers in each 64-bit element of b and c to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in a, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0])
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpmadd52luq
__m512i _mm512_maskz_madd52lo_epu64 (__mmask8 k, __m512i a, __m512i b, __m512i c)
Synopsis
__m512i _mm512_maskz_madd52lo_epu64 (__mmask8 k, __m512i a, __m512i b, __m512i c)
#include "immintrin.h"
Instruction: vpmadd52luq
CPUID Flags: AVX512IFMA52
Description
Multiply packed unsigned 52-bit integers in each 64-bit element of b and c to form a 104-bit intermediate result. Add the low 52-bit unsigned integer from the intermediate result with the corresponding unsigned 64-bit integer in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
tmp[127:0] := ZeroExtend64(b[i+51:i]) * ZeroExtend64(c[i+51:i])
dst[i+63:i] := a[i+63:i] + ZeroExtend64(tmp[51:0])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
pmaddubsw
__m128i _mm_maddubs_epi16 (__m128i a, __m128i b)
Synopsis
__m128i _mm_maddubs_epi16 (__m128i a, __m128i b)
#include "tmmintrin.h"
Instruction: pmaddubsw xmm, xmm
CPUID Flags: SSSE3
Description
Vertically multiply each unsigned 8-bit integer from a with the corresponding signed 8-bit integer from b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst.
Operation
FOR j := 0 to 7
i := j*16
dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] )
ENDFOR
Performance
vpmaddubsw
__m128i _mm_mask_maddubs_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_maddubs_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmaddubsw
CPUID Flags: AVX512VL + AVX512BW
Description
Multiply packed unsigned 8-bit integers in a by packed signed 8-bit integers in b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] )
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:128] := 0
vpmaddubsw
__m128i _mm_maskz_maddubs_epi16 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_maddubs_epi16 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmaddubsw
CPUID Flags: AVX512VL + AVX512BW
Description
Multiply packed unsigned 8-bit integers in a by packed signed 8-bit integers in b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] )
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpmaddubsw
__m256i _mm256_maddubs_epi16 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_maddubs_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaddubsw ymm, ymm, ymm
CPUID Flags: AVX2
Description
Vertically multiply each unsigned 8-bit integer from a with the corresponding signed 8-bit integer from b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst.
Operation
FOR j := 0 to 15
i := j*16
dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] )
ENDFOR
dst[MAX:256] := 0
Performance
vpmaddubsw
__m256i _mm256_mask_maddubs_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_maddubs_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaddubsw
CPUID Flags: AVX512VL + AVX512BW
Description
Multiply packed unsigned 8-bit integers in a by packed signed 8-bit integers in b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] )
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
vpmaddubsw
__m256i _mm256_maskz_maddubs_epi16 (__mmask16 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_maddubs_epi16 (__mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaddubsw
CPUID Flags: AVX512VL + AVX512BW
Description
Multiply packed unsigned 8-bit integers in a by packed signed 8-bit integers in b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] )
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpmaddubsw
__m512i _mm512_maddubs_epi16 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_maddubs_epi16 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaddubsw
CPUID Flags: AVX512BW
Description
Vertically multiply each unsigned 8-bit integer from a with the corresponding signed 8-bit integer from b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst.
Operation
FOR j := 0 to 31
i := j*16
dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] )
ENDFOR
dst[MAX:512] := 0
vpmaddubsw
__m512i _mm512_mask_maddubs_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_maddubs_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaddubsw
CPUID Flags: AVX512BW
Description
Multiply packed unsigned 8-bit integers in a by packed signed 8-bit integers in b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] )
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:512] := 0
vpmaddubsw
__m512i _mm512_maskz_maddubs_epi16 (__mmask32 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_maddubs_epi16 (__mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaddubsw
CPUID Flags: AVX512BW
Description
Multiply packed unsigned 8-bit integers in a by packed signed 8-bit integers in b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] )
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
pmaddubsw
__m64 _mm_maddubs_pi16 (__m64 a, __m64 b)
Synopsis
__m64 _mm_maddubs_pi16 (__m64 a, __m64 b)
#include "tmmintrin.h"
Instruction: pmaddubsw mm, mm
CPUID Flags: SSSE3
Description
Vertically multiply each unsigned 8-bit integer from a with the corresponding signed 8-bit integer from b, producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in dst.
Operation
FOR j := 0 to 3
i := j*16
dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] )
ENDFOR
void* _mm_malloc (size_t size, size_t align)
Synopsis
void* _mm_malloc (size_t size, size_t align)
#include "xmmintrin.h"
CPUID Flags: SSE
Description
Allocate size bytes of memory, aligned to the alignment specified in align, and return a pointer to the allocated memory. _mm_free should be used to free memory that is allocated with _mm_malloc.
kmov
int _mm512_mask2int (__mmask16 k1)
Synopsis
int _mm512_mask2int (__mmask16 k1)
#include "immintrin.h"
Instruction: kmov r32, k
CPUID Flags: KNCNI
Description
Convert bit mask k1 into an integer value, storing the result in dst. The kmov instruction zero-extends the 16-bit mask into the 32-bit destination register.
Operation
dst := ZeroExtend32(k1[15:0])
vpmaskmovd
__m128i _mm_maskload_epi32 (int const* mem_addr, __m128i mask)
Synopsis
__m128i _mm_maskload_epi32 (int const* mem_addr, __m128i mask)
#include "immintrin.h"
Instruction: vpmaskmovd xmm, xmm, m128
CPUID Flags: AVX2
Description
Load packed 32-bit integers from memory into dst using mask (elements are zeroed out when the highest bit is not set in the corresponding element).
Operation
FOR j := 0 to 3
i := j*32
IF mask[i+31]
dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
Performance
vpmaskmovd
__m256i _mm256_maskload_epi32 (int const* mem_addr, __m256i mask)
Synopsis
__m256i _mm256_maskload_epi32 (int const* mem_addr, __m256i mask)
#include "immintrin.h"
Instruction: vpmaskmovd ymm, ymm, m256
CPUID Flags: AVX2
Description
Load packed 32-bit integers from memory into dst using mask (elements are zeroed out when the highest bit is not set in the corresponding element).
Operation
FOR j := 0 to 7
i := j*32
IF mask[i+31]
dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
Performance
vpmaskmovq
__m128i _mm_maskload_epi64 (__int64 const* mem_addr, __m128i mask)
Synopsis
__m128i _mm_maskload_epi64 (__int64 const* mem_addr, __m128i mask)
#include "immintrin.h"
Instruction: vpmaskmovq xmm, xmm, m128
CPUID Flags: AVX2
Description
Load packed 64-bit integers from memory into dst using mask (elements are zeroed out when the highest bit is not set in the corresponding element).
Operation
FOR j := 0 to 1
i := j*64
IF mask[i+63]
dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
Performance
vpmaskmovq
__m256i _mm256_maskload_epi64 (__int64 const* mem_addr, __m256i mask)
Synopsis
__m256i _mm256_maskload_epi64 (__int64 const* mem_addr, __m256i mask)
#include "immintrin.h"
Instruction: vpmaskmovq ymm, ymm, m256
CPUID Flags: AVX2
Description
Load packed 64-bit integers from memory into dst using mask (elements are zeroed out when the highest bit is not set in the corresponding element).
Operation
FOR j := 0 to 3
i := j*64
IF mask[i+63]
dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
Performance
vmaskmovpd
__m128d _mm_maskload_pd (double const * mem_addr, __m128i mask)
Synopsis
__m128d _mm_maskload_pd (double const * mem_addr, __m128i mask)
#include "immintrin.h"
Instruction: vmaskmovpd xmm, xmm, m128
CPUID Flags: AVX
Description
Load packed double-precision (64-bit) floating-point elements from memory into dst using mask (elements are zeroed out when the high bit of the corresponding element is not set).
Operation
FOR j := 0 to 1
i := j*64
IF mask[i+63]
dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
Performance
vmaskmovpd
__m256d _mm256_maskload_pd (double const * mem_addr, __m256i mask)
Synopsis
__m256d _mm256_maskload_pd (double const * mem_addr, __m256i mask)
#include "immintrin.h"
Instruction: vmaskmovpd ymm, ymm, m256
CPUID Flags: AVX
Description
Load packed double-precision (64-bit) floating-point elements from memory into dst using mask (elements are zeroed out when the high bit of the corresponding element is not set).
Operation
FOR j := 0 to 3
i := j*64
IF mask[i+63]
dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
Performance
vmaskmovps
__m128 _mm_maskload_ps (float const * mem_addr, __m128i mask)
Synopsis
__m128 _mm_maskload_ps (float const * mem_addr, __m128i mask)
#include "immintrin.h"
Instruction: vmaskmovps xmm, xmm, m128
CPUID Flags: AVX
Description
Load packed single-precision (32-bit) floating-point elements from memory into dst using mask (elements are zeroed out when the high bit of the corresponding element is not set).
Operation
FOR j := 0 to 3
i := j*32
IF mask[i+31]
dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
Performance
vmaskmovps
__m256 _mm256_maskload_ps (float const * mem_addr, __m256i mask)
Synopsis
__m256 _mm256_maskload_ps (float const * mem_addr, __m256i mask)
#include "immintrin.h"
Instruction: vmaskmovps ymm, ymm, m256
CPUID Flags: AVX
Description
Load packed single-precision (32-bit) floating-point elements from memory into dst using mask (elements are zeroed out when the high bit of the corresponding element is not set).
Operation
FOR j := 0 to 7
i := j*32
IF mask[i+31]
dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
Performance
maskmovq
void _mm_maskmove_si64 (__m64 a, __m64 mask, char* mem_addr)
Synopsis
void _mm_maskmove_si64 (__m64 a, __m64 mask, char* mem_addr)
#include "xmmintrin.h"
Instruction: maskmovq mm, mm
CPUID Flags: SSE
Description
Conditionally store 8-bit integer elements from a into memory using mask (elements are not stored when the highest bit is not set in the corresponding element) and a non-temporal memory hint.
Operation
FOR j := 0 to 7
i := j*8
IF mask[i+7]
MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i]
FI
ENDFOR
maskmovdqu
void _mm_maskmoveu_si128 (__m128i a, __m128i mask, char* mem_addr)
Synopsis
void _mm_maskmoveu_si128 (__m128i a, __m128i mask, char* mem_addr)
#include "emmintrin.h"
Instruction: maskmovdqu xmm, xmm
CPUID Flags: SSE2
Description
Conditionally store 8-bit integer elements from a into memory using mask (elements are not stored when the highest bit is not set in the corresponding element) and a non-temporal memory hint. mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 15
i := j*8
IF mask[i+7]
MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i]
FI
ENDFOR
maskmovq
void _m_maskmovq (__m64 a, __m64 mask, char* mem_addr)
Synopsis
void _m_maskmovq (__m64 a, __m64 mask, char* mem_addr)
#include "xmmintrin.h"
Instruction: maskmovq mm, mm
CPUID Flags: SSE
Description
Conditionally store 8-bit integer elements from a into memory using mask (elements are not stored when the highest bit is not set in the corresponding element).
Operation
FOR j := 0 to 7
i := j*8
IF mask[i+7]
MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i]
FI
ENDFOR
vpmaskmovd
void _mm_maskstore_epi32 (int* mem_addr, __m128i mask, __m128i a)
Synopsis
void _mm_maskstore_epi32 (int* mem_addr, __m128i mask, __m128i a)
#include "immintrin.h"
Instruction: vpmaskmovd m128, xmm, xmm
CPUID Flags: AVX2
Description
Store packed 32-bit integers from a into memory using mask (elements are not stored when the highest bit is not set in the corresponding element).
Operation
FOR j := 0 to 3
i := j*32
IF mask[i+31]
MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
FI
ENDFOR
Performance
vpmaskmovd
void _mm256_maskstore_epi32 (int* mem_addr, __m256i mask, __m256i a)
Synopsis
void _mm256_maskstore_epi32 (int* mem_addr, __m256i mask, __m256i a)
#include "immintrin.h"
Instruction: vpmaskmovd m256, ymm, ymm
CPUID Flags: AVX2
Description
Store packed 32-bit integers from a into memory using mask (elements are not stored when the highest bit is not set in the corresponding element).
Operation
FOR j := 0 to 7
i := j*32
IF mask[i+31]
MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
FI
ENDFOR
Performance
vpmaskmovq
void _mm_maskstore_epi64 (__int64* mem_addr, __m128i mask, __m128i a)
Synopsis
void _mm_maskstore_epi64 (__int64* mem_addr, __m128i mask, __m128i a)
#include "immintrin.h"
Instruction: vpmaskmovq m128, xmm, xmm
CPUID Flags: AVX2
Description
Store packed 64-bit integers from a into memory using mask (elements are not stored when the highest bit is not set in the corresponding element).
Operation
FOR j := 0 to 1
i := j*64
IF mask[i+63]
MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
FI
ENDFOR
Performance
vpmaskmovq
void _mm256_maskstore_epi64 (__int64* mem_addr, __m256i mask, __m256i a)
Synopsis
void _mm256_maskstore_epi64 (__int64* mem_addr, __m256i mask, __m256i a)
#include "immintrin.h"
Instruction: vpmaskmovq m256, ymm, ymm
CPUID Flags: AVX2
Description
Store packed 64-bit integers from a into memory using mask (elements are not stored when the highest bit is not set in the corresponding element).
Operation
FOR j := 0 to 3
i := j*64
IF mask[i+63]
MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
FI
ENDFOR
Performance
vmaskmovpd
void _mm_maskstore_pd (double * mem_addr, __m128i mask, __m128d a)
Synopsis
void _mm_maskstore_pd (double * mem_addr, __m128i mask, __m128d a)
#include "immintrin.h"
Instruction: vmaskmovpd m128, xmm, xmm
CPUID Flags: AVX
Description
Store packed double-precision (64-bit) floating-point elements from a into memory using mask (elements are not stored when the high bit of the corresponding element is not set).
Operation
FOR j := 0 to 1
i := j*64
IF mask[i+63]
MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
FI
ENDFOR
Performance
vmaskmovpd
void _mm256_maskstore_pd (double * mem_addr, __m256i mask, __m256d a)
Synopsis
void _mm256_maskstore_pd (double * mem_addr, __m256i mask, __m256d a)
#include "immintrin.h"
Instruction: vmaskmovpd m256, ymm, ymm
CPUID Flags: AVX
Description
Store packed double-precision (64-bit) floating-point elements from a into memory using mask (elements are not stored when the high bit of the corresponding element is not set).
Operation
FOR j := 0 to 3
i := j*64
IF mask[i+63]
MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
FI
ENDFOR
Performance
vmaskmovps
void _mm_maskstore_ps (float * mem_addr, __m128i mask, __m128 a)
Synopsis
void _mm_maskstore_ps (float * mem_addr, __m128i mask, __m128 a)
#include "immintrin.h"
Instruction: vmaskmovps m128, xmm, xmm
CPUID Flags: AVX
Description
Store packed single-precision (32-bit) floating-point elements from a into memory using mask (elements are not stored when the high bit of the corresponding element is not set).
Operation
FOR j := 0 to 3
i := j*32
IF mask[i+31]
MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
FI
ENDFOR
Performance
vmaskmovps
void _mm256_maskstore_ps (float * mem_addr, __m256i mask, __m256 a)
Synopsis
void _mm256_maskstore_ps (float * mem_addr, __m256i mask, __m256 a)
#include "immintrin.h"
Instruction: vmaskmovps m256, ymm, ymm
CPUID Flags: AVX
Description
Store packed single-precision (32-bit) floating-point elements from a into memory using mask (elements are not stored when the high bit of the corresponding element is not set).
Operation
FOR j := 0 to 7
i := j*32
IF mask[i+31]
MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
FI
ENDFOR
Performance
vpmaxsw
__m128i _mm_mask_max_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_max_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmaxsw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 16-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
IF a[i+15:i] > b[i+15:i]
dst[i+15:i] := a[i+15:i]
ELSE
dst[i+15:i] := b[i+15:i]
FI
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:128] := 0
vpmaxsw
__m128i _mm_maskz_max_epi16 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_max_epi16 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmaxsw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 16-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
IF a[i+15:i] > b[i+15:i]
dst[i+15:i] := a[i+15:i]
ELSE
dst[i+15:i] := b[i+15:i]
FI
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
pmaxsw
__m128i _mm_max_epi16 (__m128i a, __m128i b)
Synopsis
__m128i _mm_max_epi16 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: pmaxsw xmm, xmm
CPUID Flags: SSE2
Description
Compare packed 16-bit integers in a and b, and store packed maximum values in dst.
Operation
FOR j := 0 to 7
i := j*16
IF a[i+15:i] > b[i+15:i]
dst[i+15:i] := a[i+15:i]
ELSE
dst[i+15:i] := b[i+15:i]
FI
ENDFOR
Performance
vpmaxsw
__m256i _mm256_mask_max_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_max_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaxsw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 16-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
IF a[i+15:i] > b[i+15:i]
dst[i+15:i] := a[i+15:i]
ELSE
dst[i+15:i] := b[i+15:i]
FI
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
vpmaxsw
__m256i _mm256_maskz_max_epi16 (__mmask16 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_max_epi16 (__mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaxsw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed 16-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
IF a[i+15:i] > b[i+15:i]
dst[i+15:i] := a[i+15:i]
ELSE
dst[i+15:i] := b[i+15:i]
FI
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpmaxsw
__m256i _mm256_max_epi16 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_max_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaxsw ymm, ymm, ymm
CPUID Flags: AVX2
Description
Compare packed 16-bit integers in a and b, and store packed maximum values in dst.
Operation
FOR j := 0 to 15
i := j*16
IF a[i+15:i] > b[i+15:i]
dst[i+15:i] := a[i+15:i]
ELSE
dst[i+15:i] := b[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
Performance
vpmaxsw
__m512i _mm512_mask_max_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_max_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaxsw
CPUID Flags: AVX512BW
Description
Compare packed 16-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
IF a[i+15:i] > b[i+15:i]
dst[i+15:i] := a[i+15:i]
ELSE
dst[i+15:i] := b[i+15:i]
FI
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:512] := 0
vpmaxsw
__m512i _mm512_maskz_max_epi16 (__mmask32 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_max_epi16 (__mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaxsw
CPUID Flags: AVX512BW
Description
Compare packed 16-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
IF a[i+15:i] > b[i+15:i]
dst[i+15:i] := a[i+15:i]
ELSE
dst[i+15:i] := b[i+15:i]
FI
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpmaxsw
__m512i _mm512_max_epi16 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_max_epi16 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaxsw
CPUID Flags: AVX512BW
Description
Compare packed 16-bit integers in a and b, and store packed maximum values in dst.
Operation
FOR j := 0 to 31
i := j*16
IF a[i+15:i] > b[i+15:i]
dst[i+15:i] := a[i+15:i]
ELSE
dst[i+15:i] := b[i+15:i]
FI
ENDFOR
dst[MAX:512] := 0
vpmaxsd
__m128i _mm_mask_max_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_max_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmaxsd
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 32-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
IF a[i+31:i] > b[i+31:i]
dst[i+31:i] := a[i+31:i]
ELSE
dst[i+31:i] := b[i+31:i]
FI
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vpmaxsd
__m128i _mm_maskz_max_epi32 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_max_epi32 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmaxsd
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 32-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
IF a[i+31:i] > b[i+31:i]
dst[i+31:i] := a[i+31:i]
ELSE
dst[i+31:i] := b[i+31:i]
FI
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
pmaxsd
__m128i _mm_max_epi32 (__m128i a, __m128i b)
Synopsis
__m128i _mm_max_epi32 (__m128i a, __m128i b)
#include "smmintrin.h"
Instruction: pmaxsd xmm, xmm
CPUID Flags: SSE4.1
Description
Compare packed 32-bit integers in a and b, and store packed maximum values in dst.
Operation
FOR j := 0 to 3
i := j*32
IF a[i+31:i] > b[i+31:i]
dst[i+31:i] := a[i+31:i]
ELSE
dst[i+31:i] := b[i+31:i]
FI
ENDFOR
Performance
vpmaxsd
__m256i _mm256_mask_max_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_max_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaxsd
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 32-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
IF a[i+31:i] > b[i+31:i]
dst[i+31:i] := a[i+31:i]
ELSE
dst[i+31:i] := b[i+31:i]
FI
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vpmaxsd
__m256i _mm256_maskz_max_epi32 (__mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_max_epi32 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaxsd
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed 32-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
IF a[i+31:i] > b[i+31:i]
dst[i+31:i] := a[i+31:i]
ELSE
dst[i+31:i] := b[i+31:i]
FI
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpmaxsd
__m256i _mm256_max_epi32 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_max_epi32 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaxsd ymm, ymm, ymm
CPUID Flags: AVX2
Description
Compare packed 32-bit integers in a and b, and store packed maximum values in dst.
Operation
FOR j := 0 to 7
i := j*32
IF a[i+31:i] > b[i+31:i]
dst[i+31:i] := a[i+31:i]
ELSE
dst[i+31:i] := b[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
Performance
vpmaxsd
__m512i _mm512_mask_max_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_max_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaxsd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed 32-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
IF a[i+31:i] > b[i+31:i]
dst[i+31:i] := a[i+31:i]
ELSE
dst[i+31:i] := b[i+31:i]
FI
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpmaxsd
__m512i _mm512_maskz_max_epi32 (__mmask16 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_max_epi32 (__mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaxsd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Compare packed signed 32-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
IF a[i+31:i] > b[i+31:i]
dst[i+31:i] := a[i+31:i]
ELSE
dst[i+31:i] := b[i+31:i]
FI
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpmaxsd
__m512i _mm512_max_epi32 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_max_epi32 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaxsd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed signed 32-bit integers in a and b, and store packed maximum values in dst.
Operation
FOR j := 0 to 15
i := j*32
IF a[i+31:i] > b[i+31:i]
dst[i+31:i] := a[i+31:i]
ELSE
dst[i+31:i] := b[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpmaxsq
__m128i _mm_mask_max_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_max_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmaxsq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
IF a[i+63:i] > b[i+63:i]
dst[i+63:i] := a[i+63:i]
ELSE
dst[i+63:i] := b[i+63:i]
FI
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpmaxsq
__m128i _mm_maskz_max_epi64 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_max_epi64 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmaxsq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
IF a[i+63:i] > b[i+63:i]
dst[i+63:i] := a[i+63:i]
ELSE
dst[i+63:i] := b[i+63:i]
FI
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpmaxsq
__m128i _mm_max_epi64 (__m128i a, __m128i b)
Synopsis
__m128i _mm_max_epi64 (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmaxsq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst.
Operation
FOR j := 0 to 1
i := j*64
IF a[i+63:i] > b[i+63:i]
dst[i+63:i] := a[i+63:i]
ELSE
dst[i+63:i] := b[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpmaxsq
__m256i _mm256_mask_max_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_max_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaxsq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
IF a[i+63:i] > b[i+63:i]
dst[i+63:i] := a[i+63:i]
ELSE
dst[i+63:i] := b[i+63:i]
FI
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpmaxsq
__m256i _mm256_maskz_max_epi64 (__mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_max_epi64 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaxsq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
IF a[i+63:i] > b[i+63:i]
dst[i+63:i] := a[i+63:i]
ELSE
dst[i+63:i] := b[i+63:i]
FI
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpmaxsq
__m256i _mm256_max_epi64 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_max_epi64 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaxsq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst.
Operation
FOR j := 0 to 3
i := j*64
IF a[i+63:i] > b[i+63:i]
dst[i+63:i] := a[i+63:i]
ELSE
dst[i+63:i] := b[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpmaxsq
__m512i _mm512_mask_max_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_max_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaxsq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
IF a[i+63:i] > b[i+63:i]
dst[i+63:i] := a[i+63:i]
ELSE
dst[i+63:i] := b[i+63:i]
FI
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpmaxsq
__m512i _mm512_maskz_max_epi64 (__mmask8 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_max_epi64 (__mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaxsq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
IF a[i+63:i] > b[i+63:i]
dst[i+63:i] := a[i+63:i]
ELSE
dst[i+63:i] := b[i+63:i]
FI
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpmaxsq
__m512i _mm512_max_epi64 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_max_epi64 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaxsq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Compare packed signed 64-bit integers in a and b, and store packed maximum values in dst.
Operation
FOR j := 0 to 7
i := j*64
IF a[i+63:i] > b[i+63:i]
dst[i+63:i] := a[i+63:i]
ELSE
dst[i+63:i] := b[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpmaxsb
__m128i _mm_mask_max_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_max_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmaxsb
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k[j]
IF a[i+7:i] > b[i+7:i]
dst[i+7:i] := a[i+7:i]
ELSE
dst[i+7:i] := b[i+7:i]
FI
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:128] := 0
vpmaxsb
__m128i _mm_maskz_max_epi8 (__mmask16 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_max_epi8 (__mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmaxsb
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k[j]
IF a[i+7:i] > b[i+7:i]
dst[i+7:i] := a[i+7:i]
ELSE
dst[i+7:i] := b[i+7:i]
FI
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
pmaxsb
__m128i _mm_max_epi8 (__m128i a, __m128i b)
Synopsis
__m128i _mm_max_epi8 (__m128i a, __m128i b)
#include "smmintrin.h"
Instruction: pmaxsb xmm, xmm
CPUID Flags: SSE4.1
Description
Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst.
Operation
FOR j := 0 to 15
i := j*8
IF a[i+7:i] > b[i+7:i]
dst[i+7:i] := a[i+7:i]
ELSE
dst[i+7:i] := b[i+7:i]
FI
ENDFOR
Performance
vpmaxsb
__m256i _mm256_mask_max_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_max_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaxsb
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k[j]
IF a[i+7:i] > b[i+7:i]
dst[i+7:i] := a[i+7:i]
ELSE
dst[i+7:i] := b[i+7:i]
FI
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:256] := 0
vpmaxsb
__m256i _mm256_maskz_max_epi8 (__mmask32 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_max_epi8 (__mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaxsb
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k[j]
IF a[i+7:i] > b[i+7:i]
dst[i+7:i] := a[i+7:i]
ELSE
dst[i+7:i] := b[i+7:i]
FI
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpmaxsb
__m256i _mm256_max_epi8 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_max_epi8 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaxsb ymm, ymm, ymm
CPUID Flags: AVX2
Description
Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst.
Operation
FOR j := 0 to 31
i := j*8
IF a[i+7:i] > b[i+7:i]
dst[i+7:i] := a[i+7:i]
ELSE
dst[i+7:i] := b[i+7:i]
FI
ENDFOR
dst[MAX:256] := 0
Performance
vpmaxsb
__m512i _mm512_mask_max_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_max_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaxsb
CPUID Flags: AVX512BW
Description
Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k[j]
IF a[i+7:i] > b[i+7:i]
dst[i+7:i] := a[i+7:i]
ELSE
dst[i+7:i] := b[i+7:i]
FI
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:512] := 0
vpmaxsb
__m512i _mm512_maskz_max_epi8 (__mmask64 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_max_epi8 (__mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaxsb
CPUID Flags: AVX512BW
Description
Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k[j]
IF a[i+7:i] > b[i+7:i]
dst[i+7:i] := a[i+7:i]
ELSE
dst[i+7:i] := b[i+7:i]
FI
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpmaxsb
__m512i _mm512_max_epi8 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_max_epi8 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaxsb
CPUID Flags: AVX512BW
Description
Compare packed signed 8-bit integers in a and b, and store packed maximum values in dst.
Operation
FOR j := 0 to 63
i := j*8
IF a[i+7:i] > b[i+7:i]
dst[i+7:i] := a[i+7:i]
ELSE
dst[i+7:i] := b[i+7:i]
FI
ENDFOR
dst[MAX:512] := 0
vpmaxuw
__m128i _mm_mask_max_epu16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_max_epu16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmaxuw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
IF a[i+15:i] > b[i+15:i]
dst[i+15:i] := a[i+15:i]
ELSE
dst[i+15:i] := b[i+15:i]
FI
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:128] := 0
vpmaxuw
__m128i _mm_maskz_max_epu16 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_max_epu16 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmaxuw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
IF a[i+15:i] > b[i+15:i]
dst[i+15:i] := a[i+15:i]
ELSE
dst[i+15:i] := b[i+15:i]
FI
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
pmaxuw
__m128i _mm_max_epu16 (__m128i a, __m128i b)
Synopsis
__m128i _mm_max_epu16 (__m128i a, __m128i b)
#include "smmintrin.h"
Instruction: pmaxuw xmm, xmm
CPUID Flags: SSE4.1
Description
Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst.
Operation
FOR j := 0 to 7
i := j*16
IF a[i+15:i] > b[i+15:i]
dst[i+15:i] := a[i+15:i]
ELSE
dst[i+15:i] := b[i+15:i]
FI
ENDFOR
Performance
vpmaxuw
__m256i _mm256_mask_max_epu16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_max_epu16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaxuw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
IF a[i+15:i] > b[i+15:i]
dst[i+15:i] := a[i+15:i]
ELSE
dst[i+15:i] := b[i+15:i]
FI
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
vpmaxuw
__m256i _mm256_maskz_max_epu16 (__mmask16 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_max_epu16 (__mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaxuw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
IF a[i+15:i] > b[i+15:i]
dst[i+15:i] := a[i+15:i]
ELSE
dst[i+15:i] := b[i+15:i]
FI
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpmaxuw
__m256i _mm256_max_epu16 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_max_epu16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaxuw ymm, ymm, ymm
CPUID Flags: AVX2
Description
Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst.
Operation
FOR j := 0 to 15
i := j*16
IF a[i+15:i] > b[i+15:i]
dst[i+15:i] := a[i+15:i]
ELSE
dst[i+15:i] := b[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
Performance
vpmaxuw
__m512i _mm512_mask_max_epu16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_max_epu16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaxuw
CPUID Flags: AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
IF a[i+15:i] > b[i+15:i]
dst[i+15:i] := a[i+15:i]
ELSE
dst[i+15:i] := b[i+15:i]
FI
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:512] := 0
vpmaxuw
__m512i _mm512_maskz_max_epu16 (__mmask32 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_max_epu16 (__mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaxuw
CPUID Flags: AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
IF a[i+15:i] > b[i+15:i]
dst[i+15:i] := a[i+15:i]
ELSE
dst[i+15:i] := b[i+15:i]
FI
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpmaxuw
__m512i _mm512_max_epu16 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_max_epu16 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaxuw
CPUID Flags: AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b, and store packed maximum values in dst.
Operation
FOR j := 0 to 31
i := j*16
IF a[i+15:i] > b[i+15:i]
dst[i+15:i] := a[i+15:i]
ELSE
dst[i+15:i] := b[i+15:i]
FI
ENDFOR
dst[MAX:512] := 0
vpmaxud
__m128i _mm_mask_max_epu32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_max_epu32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmaxud
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
IF a[i+31:i] > b[i+31:i]
dst[i+31:i] := a[i+31:i]
ELSE
dst[i+31:i] := b[i+31:i]
FI
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vpmaxud
__m128i _mm_maskz_max_epu32 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_max_epu32 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmaxud
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
IF a[i+31:i] > b[i+31:i]
dst[i+31:i] := a[i+31:i]
ELSE
dst[i+31:i] := b[i+31:i]
FI
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
pmaxud
__m128i _mm_max_epu32 (__m128i a, __m128i b)
Synopsis
__m128i _mm_max_epu32 (__m128i a, __m128i b)
#include "smmintrin.h"
Instruction: pmaxud xmm, xmm
CPUID Flags: SSE4.1
Description
Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst.
Operation
FOR j := 0 to 3
i := j*32
IF a[i+31:i] > b[i+31:i]
dst[i+31:i] := a[i+31:i]
ELSE
dst[i+31:i] := b[i+31:i]
FI
ENDFOR
Performance
vpmaxud
__m256i _mm256_mask_max_epu32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_max_epu32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaxud
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
IF a[i+31:i] > b[i+31:i]
dst[i+31:i] := a[i+31:i]
ELSE
dst[i+31:i] := b[i+31:i]
FI
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vpmaxud
__m256i _mm256_maskz_max_epu32 (__mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_max_epu32 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaxud
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
IF a[i+31:i] > b[i+31:i]
dst[i+31:i] := a[i+31:i]
ELSE
dst[i+31:i] := b[i+31:i]
FI
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpmaxud
__m256i _mm256_max_epu32 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_max_epu32 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaxud ymm, ymm, ymm
CPUID Flags: AVX2
Description
Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst.
Operation
FOR j := 0 to 7
i := j*32
IF a[i+31:i] > b[i+31:i]
dst[i+31:i] := a[i+31:i]
ELSE
dst[i+31:i] := b[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
Performance
vpmaxud
__m512i _mm512_mask_max_epu32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_max_epu32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaxud zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
IF a[i+31:i] > b[i+31:i]
dst[i+31:i] := a[i+31:i]
ELSE
dst[i+31:i] := b[i+31:i]
FI
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpmaxud
__m512i _mm512_maskz_max_epu32 (__mmask16 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_max_epu32 (__mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaxud zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
IF a[i+31:i] > b[i+31:i]
dst[i+31:i] := a[i+31:i]
ELSE
dst[i+31:i] := b[i+31:i]
FI
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpmaxud
__m512i _mm512_max_epu32 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_max_epu32 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaxud zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed unsigned 32-bit integers in a and b, and store packed maximum values in dst.
Operation
FOR j := 0 to 15
i := j*32
IF a[i+31:i] > b[i+31:i]
dst[i+31:i] := a[i+31:i]
ELSE
dst[i+31:i] := b[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpmaxuq
__m128i _mm_mask_max_epu64 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_max_epu64 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmaxuq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
IF a[i+63:i] > b[i+63:i]
dst[i+63:i] := a[i+63:i]
ELSE
dst[i+63:i] := b[i+63:i]
FI
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpmaxuq
__m128i _mm_maskz_max_epu64 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_max_epu64 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmaxuq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
IF a[i+63:i] > b[i+63:i]
dst[i+63:i] := a[i+63:i]
ELSE
dst[i+63:i] := b[i+63:i]
FI
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpmaxuq
__m128i _mm_max_epu64 (__m128i a, __m128i b)
Synopsis
__m128i _mm_max_epu64 (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmaxuq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst.
Operation
FOR j := 0 to 1
i := j*64
IF a[i+63:i] > b[i+63:i]
dst[i+63:i] := a[i+63:i]
ELSE
dst[i+63:i] := b[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpmaxuq
__m256i _mm256_mask_max_epu64 (__m256i src, __mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_max_epu64 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaxuq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
IF a[i+63:i] > b[i+63:i]
dst[i+63:i] := a[i+63:i]
ELSE
dst[i+63:i] := b[i+63:i]
FI
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpmaxuq
__m256i _mm256_maskz_max_epu64 (__mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_max_epu64 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaxuq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
IF a[i+63:i] > b[i+63:i]
dst[i+63:i] := a[i+63:i]
ELSE
dst[i+63:i] := b[i+63:i]
FI
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpmaxuq
__m256i _mm256_max_epu64 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_max_epu64 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaxuq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst.
Operation
FOR j := 0 to 3
i := j*64
IF a[i+63:i] > b[i+63:i]
dst[i+63:i] := a[i+63:i]
ELSE
dst[i+63:i] := b[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpmaxuq
__m512i _mm512_mask_max_epu64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_max_epu64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaxuq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
IF a[i+63:i] > b[i+63:i]
dst[i+63:i] := a[i+63:i]
ELSE
dst[i+63:i] := b[i+63:i]
FI
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpmaxuq
__m512i _mm512_maskz_max_epu64 (__mmask8 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_max_epu64 (__mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaxuq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
IF a[i+63:i] > b[i+63:i]
dst[i+63:i] := a[i+63:i]
ELSE
dst[i+63:i] := b[i+63:i]
FI
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpmaxuq
__m512i _mm512_max_epu64 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_max_epu64 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaxuq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Compare packed unsigned 64-bit integers in a and b, and store packed maximum values in dst.
Operation
FOR j := 0 to 7
i := j*64
IF a[i+63:i] > b[i+63:i]
dst[i+63:i] := a[i+63:i]
ELSE
dst[i+63:i] := b[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpmaxub
__m128i _mm_mask_max_epu8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_max_epu8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmaxub
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k[j]
IF a[i+7:i] > b[i+7:i]
dst[i+7:i] := a[i+7:i]
ELSE
dst[i+7:i] := b[i+7:i]
FI
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:128] := 0
vpmaxub
__m128i _mm_maskz_max_epu8 (__mmask16 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_max_epu8 (__mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmaxub
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k[j]
IF a[i+7:i] > b[i+7:i]
dst[i+7:i] := a[i+7:i]
ELSE
dst[i+7:i] := b[i+7:i]
FI
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
pmaxub
__m128i _mm_max_epu8 (__m128i a, __m128i b)
Synopsis
__m128i _mm_max_epu8 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: pmaxub xmm, xmm
CPUID Flags: SSE2
Description
Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst.
Operation
FOR j := 0 to 15
i := j*8
IF a[i+7:i] > b[i+7:i]
dst[i+7:i] := a[i+7:i]
ELSE
dst[i+7:i] := b[i+7:i]
FI
ENDFOR
Performance
vpmaxub
__m256i _mm256_mask_max_epu8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_max_epu8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaxub
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k[j]
IF a[i+7:i] > b[i+7:i]
dst[i+7:i] := a[i+7:i]
ELSE
dst[i+7:i] := b[i+7:i]
FI
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:256] := 0
vpmaxub
__m256i _mm256_maskz_max_epu8 (__mmask32 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_max_epu8 (__mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaxub
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k[j]
IF a[i+7:i] > b[i+7:i]
dst[i+7:i] := a[i+7:i]
ELSE
dst[i+7:i] := b[i+7:i]
FI
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpmaxub
__m256i _mm256_max_epu8 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_max_epu8 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmaxub ymm, ymm, ymm
CPUID Flags: AVX2
Description
Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst.
Operation
FOR j := 0 to 31
i := j*8
IF a[i+7:i] > b[i+7:i]
dst[i+7:i] := a[i+7:i]
ELSE
dst[i+7:i] := b[i+7:i]
FI
ENDFOR
dst[MAX:256] := 0
Performance
vpmaxub
__m512i _mm512_mask_max_epu8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_max_epu8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaxub
CPUID Flags: AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k[j]
IF a[i+7:i] > b[i+7:i]
dst[i+7:i] := a[i+7:i]
ELSE
dst[i+7:i] := b[i+7:i]
FI
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:512] := 0
vpmaxub
__m512i _mm512_maskz_max_epu8 (__mmask64 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_max_epu8 (__mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaxub
CPUID Flags: AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k[j]
IF a[i+7:i] > b[i+7:i]
dst[i+7:i] := a[i+7:i]
ELSE
dst[i+7:i] := b[i+7:i]
FI
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpmaxub
__m512i _mm512_max_epu8 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_max_epu8 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmaxub
CPUID Flags: AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst.
Operation
FOR j := 0 to 63
i := j*8
IF a[i+7:i] > b[i+7:i]
dst[i+7:i] := a[i+7:i]
ELSE
dst[i+7:i] := b[i+7:i]
FI
ENDFOR
dst[MAX:512] := 0
vmaxpd
__m128d _mm_mask_max_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_mask_max_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vmaxpd
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vmaxpd
__m128d _mm_maskz_max_pd (__mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_maskz_max_pd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vmaxpd
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
maxpd
__m128d _mm_max_pd (__m128d a, __m128d b)
Synopsis
__m128d _mm_max_pd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: maxpd xmm, xmm
CPUID Flags: SSE2
Description
Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
ENDFOR
Performance
vmaxpd
__m256d _mm256_mask_max_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)
Synopsis
__m256d _mm256_mask_max_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vmaxpd
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vmaxpd
__m256d _mm256_maskz_max_pd (__mmask8 k, __m256d a, __m256d b)
Synopsis
__m256d _mm256_maskz_max_pd (__mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vmaxpd
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vmaxpd
__m256d _mm256_max_pd (__m256d a, __m256d b)
Synopsis
__m256d _mm256_max_pd (__m256d a, __m256d b)
#include "immintrin.h"
Instruction: vmaxpd ymm, ymm, ymm
CPUID Flags: AVX
Description
Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:256] := 0
Performance
vmaxpd
__m512d _mm512_mask_max_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
Synopsis
__m512d _mm512_mask_max_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vmaxpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vmaxpd
__m512d _mm512_maskz_max_pd (__mmask8 k, __m512d a, __m512d b)
Synopsis
__m512d _mm512_maskz_max_pd (__mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vmaxpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vmaxpd
__m512d _mm512_max_pd (__m512d a, __m512d b)
Synopsis
__m512d _mm512_max_pd (__m512d a, __m512d b)
#include "immintrin.h"
Instruction: vmaxpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:512] := 0
pmaxsw
__m64 _mm_max_pi16 (__m64 a, __m64 b)
Synopsis
__m64 _mm_max_pi16 (__m64 a, __m64 b)
#include "xmmintrin.h"
Instruction: pmaxsw mm, mm
CPUID Flags: SSE
Description
Compare packed signed 16-bit integers in a and b, and store packed maximum values in dst.
Operation
FOR j := 0 to 3
i := j*16
IF a[i+15:i] > b[i+15:i]
dst[i+15:i] := a[i+15:i]
ELSE
dst[i+15:i] := b[i+15:i]
FI
ENDFOR
Performance
vmaxps
__m128 _mm_mask_max_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_mask_max_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vmaxps
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vmaxps
__m128 _mm_maskz_max_ps (__mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_maskz_max_ps (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vmaxps
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
maxps
__m128 _mm_max_ps (__m128 a, __m128 b)
Synopsis
__m128 _mm_max_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: maxps xmm, xmm
CPUID Flags: SSE
Description
Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
ENDFOR
Performance
vmaxps
__m256 _mm256_mask_max_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)
Synopsis
__m256 _mm256_mask_max_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vmaxps
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vmaxps
__m256 _mm256_maskz_max_ps (__mmask8 k, __m256 a, __m256 b)
Synopsis
__m256 _mm256_maskz_max_ps (__mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vmaxps
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vmaxps
__m256 _mm256_max_ps (__m256 a, __m256 b)
Synopsis
__m256 _mm256_max_ps (__m256 a, __m256 b)
#include "immintrin.h"
Instruction: vmaxps ymm, ymm, ymm
CPUID Flags: AVX
Description
Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
ENDFOR
dst[MAX:256] := 0
Performance
vmaxps
__m512 _mm512_mask_max_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
Synopsis
__m512 _mm512_mask_max_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vmaxps zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vmaxps
__m512 _mm512_maskz_max_ps (__mmask16 k, __m512 a, __m512 b)
Synopsis
__m512 _mm512_maskz_max_ps (__mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vmaxps zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vmaxps
__m512 _mm512_max_ps (__m512 a, __m512 b)
Synopsis
__m512 _mm512_max_ps (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vmaxps zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
ENDFOR
dst[MAX:512] := 0
pmaxub
__m64 _mm_max_pu8 (__m64 a, __m64 b)
Synopsis
__m64 _mm_max_pu8 (__m64 a, __m64 b)
#include "xmmintrin.h"
Instruction: pmaxub mm, mm
CPUID Flags: SSE
Description
Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst.
Operation
FOR j := 0 to 7
i := j*8
IF a[i+7:i] > b[i+7:i]
dst[i+7:i] := a[i+7:i]
ELSE
dst[i+7:i] := b[i+7:i]
FI
ENDFOR
Performance
vmaxpd
__m512d _mm512_mask_max_round_pd (__m512d src, __mmask8 k, __m512d a, __m512d b, int sae)
Synopsis
__m512d _mm512_mask_max_round_pd (__m512d src, __mmask8 k, __m512d a, __m512d b, int sae)
#include "immintrin.h"
Instruction: vmaxpd zmm {k}, zmm, zmm {sae}
CPUID Flags: AVX512F
Description
Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vmaxpd
__m512d _mm512_maskz_max_round_pd (__mmask8 k, __m512d a, __m512d b, int sae)
Synopsis
__m512d _mm512_maskz_max_round_pd (__mmask8 k, __m512d a, __m512d b, int sae)
#include "immintrin.h"
Instruction: vmaxpd zmm {k}, zmm, zmm {sae}
CPUID Flags: AVX512F
Description
Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vmaxpd
__m512d _mm512_max_round_pd (__m512d a, __m512d b, int sae)
Synopsis
__m512d _mm512_max_round_pd (__m512d a, __m512d b, int sae)
#include "immintrin.h"
Instruction: vmaxpd zmm {k}, zmm, zmm {sae}
CPUID Flags: AVX512F
Description
Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed maximum values in dst.
Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := MAX(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:512] := 0
vmaxps
__m512 _mm512_mask_max_round_ps (__m512 src, __mmask16 k, __m512 a, __m512 b, int sae)
Synopsis
__m512 _mm512_mask_max_round_ps (__m512 src, __mmask16 k, __m512 a, __m512 b, int sae)
#include "immintrin.h"
Instruction: vmaxps zmm {k}, zmm, zmm {sae}
CPUID Flags: AVX512F
Description
Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vmaxps
__m512 _mm512_maskz_max_round_ps (__mmask16 k, __m512 a, __m512 b, int sae)
Synopsis
__m512 _mm512_maskz_max_round_ps (__mmask16 k, __m512 a, __m512 b, int sae)
#include "immintrin.h"
Instruction: vmaxps zmm {k}, zmm, zmm {sae}
CPUID Flags: AVX512F
Description
Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vmaxps
__m512 _mm512_max_round_ps (__m512 a, __m512 b, int sae)
Synopsis
__m512 _mm512_max_round_ps (__m512 a, __m512 b, int sae)
#include "immintrin.h"
Instruction: vmaxps zmm {k}, zmm, zmm {sae}
CPUID Flags: AVX512F
Description
Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed maximum values in dst.
Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
ENDFOR
dst[MAX:512] := 0
vmaxsd
__m128d _mm_mask_max_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int sae)
Synopsis
__m128d _mm_mask_max_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int sae)
#include "immintrin.h"
Instruction: vmaxsd xmm {k}, xmm, xmm {sae}
CPUID Flags: AVX512F
Description
Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
IF k[0]
dst[63:0] := MAX(a[63:0], b[63:0])
ELSE
dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vmaxsd
__m128d _mm_maskz_max_round_sd (__mmask8 k, __m128d a, __m128d b, int sae)
Synopsis
__m128d _mm_maskz_max_round_sd (__mmask8 k, __m128d a, __m128d b, int sae)
#include "immintrin.h"
Instruction: vmaxsd xmm {k}, xmm, xmm {sae}
CPUID Flags: AVX512F
Description
Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
IF k[0]
dst[63:0] := MAX(a[63:0], b[63:0])
ELSE
dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vmaxsd
__m128d _mm_max_round_sd (__m128d a, __m128d b, int sae)
Synopsis
__m128d _mm_max_round_sd (__m128d a, __m128d b, int sae)
#include "immintrin.h"
Instruction: vmaxsd xmm {k}, xmm, xmm {sae}
CPUID Flags: AVX512F
Description
Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst, and copy the upper element from a to the upper element of dst.
Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
dst[63:0] := MAX(a[63:0], b[63:0])
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vmaxss
__m128 _mm_mask_max_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int sae)
Synopsis
__m128 _mm_mask_max_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int sae)
#include "immintrin.h"
Instruction: vmaxss xmm {k}, xmm, xmm {sae}
CPUID Flags: AVX512F
Description
Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
IF k[0]
dst[31:0] := MAX(a[31:0], b[31:0])
ELSE
dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vmaxss
__m128 _mm_maskz_max_round_ss (__mmask8 k, __m128 a, __m128 b, int sae)
Synopsis
__m128 _mm_maskz_max_round_ss (__mmask8 k, __m128 a, __m128 b, int sae)
#include "immintrin.h"
Instruction: vmaxss xmm {k}, xmm, xmm {sae}
CPUID Flags: AVX512F
Description
Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
IF k[0]
dst[31:0] := MAX(a[31:0], b[31:0])
ELSE
dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vmaxss
__m128 _mm_max_round_ss (__m128 a, __m128 b, int sae)
Synopsis
__m128 _mm_max_round_ss (__m128 a, __m128 b, int sae)
#include "immintrin.h"
Instruction: vmaxss xmm {k}, xmm, xmm {sae}
CPUID Flags: AVX512F
Description
Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst, and copy the upper element from a to the upper element of dst.
Pass __MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
dst[31:0] := MAX(a[31:0], b[31:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vmaxsd
__m128d _mm_mask_max_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_mask_max_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vmaxsd xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Operation
IF k[0]
dst[63:0] := MAX(a[63:0], b[63:0])
ELSE
dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vmaxsd
__m128d _mm_maskz_max_sd (__mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_maskz_max_sd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vmaxsd xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Operation
IF k[0]
dst[63:0] := MAX(a[63:0], b[63:0])
ELSE
dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
maxsd
__m128d _mm_max_sd (__m128d a, __m128d b)
Synopsis
__m128d _mm_max_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: maxsd xmm, xmm
CPUID Flags: SSE2
Description
Compare the lower double-precision (64-bit) floating-point elements in a and b, store the maximum value in the lower element of dst, and copy the upper element from a to the upper element of dst.
Operation
dst[63:0] := MAX(a[63:0], b[63:0])
dst[127:64] := a[127:64]
Performance
vmaxss
__m128 _mm_mask_max_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_mask_max_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vmaxss xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Operation
IF k[0]
dst[31:0] := MAX(a[31:0], b[31:0])
ELSE
dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vmaxss
__m128 _mm_maskz_max_ss (__mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_maskz_max_ss (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vmaxss xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Operation
IF k[0]
dst[31:0] := MAX(a[31:0], b[31:0])
ELSE
dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
maxss
__m128 _mm_max_ss (__m128 a, __m128 b)
Synopsis
__m128 _mm_max_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: maxss xmm, xmm
CPUID Flags: SSE
Description
Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst, and copy the upper element from a to the upper element of dst.
Operation
dst[31:0] := MAX(a[31:0], b[31:0])
dst[127:32] := a[127:32]
Performance
vgmaxabsps
__m512 _mm512_mask_maxabs_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
Synopsis
__m512 _mm512_mask_maxabs_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vgmaxabsps zmm {k}, zmm, m512
CPUID Flags: KNCNI
Description
Determines the maximum of the absolute elements of each pair of corresponding elements of packed single-precision (32-bit) floating-point elements in a and b, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := FpMax(Abs(a[i+31:i]), Abs(b[i+31:i]))
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vgmaxabsps
__m512 _mm512_maxabs_ps (__m512 a, __m512 b)
Synopsis
__m512 _mm512_maxabs_ps (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vgmaxabsps zmm {k}, zmm, m512
CPUID Flags: KNCNI
Description
Determines the maximum of the absolute elements of each pair of corresponding elements of packed single-precision (32-bit) floating-point elements in a and b, storing the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := FpMax(Abs(a[i+31:i]), Abs(b[i+31:i]))
ENDFOR
dst[MAX:512] := 0
...
int _may_i_use_cpu_feature (unsigned __int64 a)
Synopsis
int _may_i_use_cpu_feature (unsigned __int64 a)
#include "immintrin.h"
Description
Dynamically query the processor to determine if the processor-specific feature(s) specified in a are available, and return true or false (1 or 0) if the set of features is available. Multiple features may be OR'd together. This intrinsic does not check the processor vendor. See the valid feature flags below:
Operation
_FEATURE_GENERIC_IA32
_FEATURE_FPU
_FEATURE_CMOV
_FEATURE_MMX
_FEATURE_FXSAVE
_FEATURE_SSE
_FEATURE_SSE2
_FEATURE_SSE3
_FEATURE_SSSE3
_FEATURE_SSE4_1
_FEATURE_SSE4_2
_FEATURE_MOVBE
_FEATURE_POPCNT
_FEATURE_PCLMULQDQ
_FEATURE_AES
_FEATURE_F16C
_FEATURE_AVX
_FEATURE_RDRND
_FEATURE_FMA
_FEATURE_BMI
_FEATURE_LZCNT
_FEATURE_HLE
_FEATURE_RTM
_FEATURE_AVX2
_FEATURE_KNCNI
_FEATURE_AVX512F
_FEATURE_ADX
_FEATURE_RDSEED
_FEATURE_AVX512ER
_FEATURE_AVX512PF
_FEATURE_AVX512CD
_FEATURE_SHA
_FEATURE_MPX
mfence
void _mm_mfence (void)
Synopsis
void _mm_mfence (void)
#include "emmintrin.h"
Instruction: mfence
CPUID Flags: SSE2
Description
Perform a serializing operation on all load-from-memory and store-to-memory instructions that were issued prior to this instruction. Guarantees that every memory access that precedes, in program order, the memory fence instruction is globally visible before any memory instruction which follows the fence in program order.
Performance
vpminsw
__m128i _mm_mask_min_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_min_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpminsw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed signed 16-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
IF a[i+15:i] < b[i+15:i]
dst[i+15:i] := a[i+15:i]
ELSE
dst[i+15:i] := b[i+15:i]
FI
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:128] := 0
vpminsw
__m128i _mm_maskz_min_epi16 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_min_epi16 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpminsw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed signed 16-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
IF a[i+15:i] < b[i+15:i]
dst[i+15:i] := a[i+15:i]
ELSE
dst[i+15:i] := b[i+15:i]
FI
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
pminsw
__m128i _mm_min_epi16 (__m128i a, __m128i b)
Synopsis
__m128i _mm_min_epi16 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: pminsw xmm, xmm
CPUID Flags: SSE2
Description
Compare packed signed 16-bit integers in a and b, and store packed minimum values in dst.
Operation
FOR j := 0 to 7
i := j*16
IF a[i+15:i] < b[i+15:i]
dst[i+15:i] := a[i+15:i]
ELSE
dst[i+15:i] := b[i+15:i]
FI
ENDFOR
Performance
vpminsw
__m256i _mm256_mask_min_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_min_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpminsw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed signed 16-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
IF a[i+15:i] < b[i+15:i]
dst[i+15:i] := a[i+15:i]
ELSE
dst[i+15:i] := b[i+15:i]
FI
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
vpminsw
__m256i _mm256_maskz_min_epi16 (__mmask16 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_min_epi16 (__mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpminsw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed signed 16-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
IF a[i+15:i] < b[i+15:i]
dst[i+15:i] := a[i+15:i]
ELSE
dst[i+15:i] := b[i+15:i]
FI
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpminsw
__m256i _mm256_min_epi16 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_min_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpminsw ymm, ymm, ymm
CPUID Flags: AVX2
Description
Compare packed signed 16-bit integers in a and b, and store packed minimum values in dst.
Operation
FOR j := 0 to 15
i := j*16
IF a[i+15:i] < b[i+15:i]
dst[i+15:i] := a[i+15:i]
ELSE
dst[i+15:i] := b[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
Performance
vpminsw
__m512i _mm512_mask_min_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_min_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpminsw
CPUID Flags: AVX512BW
Description
Compare packed signed 16-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
IF a[i+15:i] < b[i+15:i]
dst[i+15:i] := a[i+15:i]
ELSE
dst[i+15:i] := b[i+15:i]
FI
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:512] := 0
vpminsw
__m512i _mm512_maskz_min_epi16 (__mmask32 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_min_epi16 (__mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpminsw
CPUID Flags: AVX512BW
Description
Compare packed signed 16-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
IF a[i+15:i] < b[i+15:i]
dst[i+15:i] := a[i+15:i]
ELSE
dst[i+15:i] := b[i+15:i]
FI
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpminsw
__m512i _mm512_min_epi16 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_min_epi16 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpminsw
CPUID Flags: AVX512BW
Description
Compare packed signed 16-bit integers in a and b, and store packed minimum values in dst.
Operation
FOR j := 0 to 31
i := j*16
IF a[i+15:i] < b[i+15:i]
dst[i+15:i] := a[i+15:i]
ELSE
dst[i+15:i] := b[i+15:i]
FI
ENDFOR
dst[MAX:512] := 0
vpminsd
__m128i _mm_mask_min_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_min_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpminsd
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
IF a[i+31:i] < b[i+31:i]
dst[i+31:i] := a[i+31:i]
ELSE
dst[i+31:i] := b[i+31:i]
FI
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vpminsd
__m128i _mm_maskz_min_epi32 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_min_epi32 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpminsd
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
IF a[i+31:i] < b[i+31:i]
dst[i+31:i] := a[i+31:i]
ELSE
dst[i+31:i] := b[i+31:i]
FI
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
pminsd
__m128i _mm_min_epi32 (__m128i a, __m128i b)
Synopsis
__m128i _mm_min_epi32 (__m128i a, __m128i b)
#include "smmintrin.h"
Instruction: pminsd xmm, xmm
CPUID Flags: SSE4.1
Description
Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst.
Operation
FOR j := 0 to 3
i := j*32
IF a[i+31:i] < b[i+31:i]
dst[i+31:i] := a[i+31:i]
ELSE
dst[i+31:i] := b[i+31:i]
FI
ENDFOR
Performance
vpminsd
__m256i _mm256_mask_min_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_min_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpminsd
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
IF a[i+31:i] < b[i+31:i]
dst[i+31:i] := a[i+31:i]
ELSE
dst[i+31:i] := b[i+31:i]
FI
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vpminsd
__m256i _mm256_maskz_min_epi32 (__mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_min_epi32 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpminsd
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
IF a[i+31:i] < b[i+31:i]
dst[i+31:i] := a[i+31:i]
ELSE
dst[i+31:i] := b[i+31:i]
FI
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpminsd
__m256i _mm256_min_epi32 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_min_epi32 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpminsd ymm, ymm, ymm
CPUID Flags: AVX2
Description
Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst.
Operation
FOR j := 0 to 7
i := j*32
IF a[i+31:i] < b[i+31:i]
dst[i+31:i] := a[i+31:i]
ELSE
dst[i+31:i] := b[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
Performance
vpminsd
__m512i _mm512_mask_min_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_min_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpminsd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
IF a[i+31:i] < b[i+31:i]
dst[i+31:i] := a[i+31:i]
ELSE
dst[i+31:i] := b[i+31:i]
FI
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpminsd
__m512i _mm512_maskz_min_epi32 (__mmask16 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_min_epi32 (__mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpminsd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
IF a[i+31:i] < b[i+31:i]
dst[i+31:i] := a[i+31:i]
ELSE
dst[i+31:i] := b[i+31:i]
FI
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpminsd
__m512i _mm512_min_epi32 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_min_epi32 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpminsd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed signed 32-bit integers in a and b, and store packed minimum values in dst.
Operation
FOR j := 0 to 15
i := j*32
IF a[i+31:i] < b[i+31:i]
dst[i+31:i] := a[i+31:i]
ELSE
dst[i+31:i] := b[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpminsq
__m128i _mm_mask_min_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_min_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpminsq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
IF a[i+63:i] < b[i+63:i]
dst[i+63:i] := a[i+63:i]
ELSE
dst[i+63:i] := b[i+63:i]
FI
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpminsq
__m128i _mm_maskz_min_epi64 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_min_epi64 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpminsq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
IF a[i+63:i] < b[i+63:i]
dst[i+63:i] := a[i+63:i]
ELSE
dst[i+63:i] := b[i+63:i]
FI
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpminsq
__m128i _mm_min_epi64 (__m128i a, __m128i b)
Synopsis
__m128i _mm_min_epi64 (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpminsq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst.
Operation
FOR j := 0 to 1
i := j*64
IF a[i+63:i] < b[i+63:i]
dst[i+63:i] := a[i+63:i]
ELSE
dst[i+63:i] := b[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpminsq
__m256i _mm256_mask_min_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_min_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpminsq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
IF a[i+63:i] < b[i+63:i]
dst[i+63:i] := a[i+63:i]
ELSE
dst[i+63:i] := b[i+63:i]
FI
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpminsq
__m256i _mm256_maskz_min_epi64 (__mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_min_epi64 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpminsq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
IF a[i+63:i] < b[i+63:i]
dst[i+63:i] := a[i+63:i]
ELSE
dst[i+63:i] := b[i+63:i]
FI
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpminsq
__m256i _mm256_min_epi64 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_min_epi64 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpminsq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst.
Operation
FOR j := 0 to 3
i := j*64
IF a[i+63:i] < b[i+63:i]
dst[i+63:i] := a[i+63:i]
ELSE
dst[i+63:i] := b[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpminsq
__m512i _mm512_mask_min_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_min_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpminsq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
IF a[i+63:i] < b[i+63:i]
dst[i+63:i] := a[i+63:i]
ELSE
dst[i+63:i] := b[i+63:i]
FI
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpminsq
__m512i _mm512_maskz_min_epi64 (__mmask8 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_min_epi64 (__mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpminsq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
IF a[i+63:i] < b[i+63:i]
dst[i+63:i] := a[i+63:i]
ELSE
dst[i+63:i] := b[i+63:i]
FI
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpminsq
__m512i _mm512_min_epi64 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_min_epi64 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpminsq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Compare packed signed 64-bit integers in a and b, and store packed minimum values in dst.
Operation
FOR j := 0 to 7
i := j*64
IF a[i+63:i] < b[i+63:i]
dst[i+63:i] := a[i+63:i]
ELSE
dst[i+63:i] := b[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpminsb
__m128i _mm_mask_min_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_min_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpminsb
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k[j]
IF a[i+7:i] < b[i+7:i]
dst[i+7:i] := a[i+7:i]
ELSE
dst[i+7:i] := b[i+7:i]
FI
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:128] := 0
vpminsb
__m128i _mm_maskz_min_epi8 (__mmask16 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_min_epi8 (__mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpminsb
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k[j]
IF a[i+7:i] < b[i+7:i]
dst[i+7:i] := a[i+7:i]
ELSE
dst[i+7:i] := b[i+7:i]
FI
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
pminsb
__m128i _mm_min_epi8 (__m128i a, __m128i b)
Synopsis
__m128i _mm_min_epi8 (__m128i a, __m128i b)
#include "smmintrin.h"
Instruction: pminsb xmm, xmm
CPUID Flags: SSE4.1
Description
Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst.
Operation
FOR j := 0 to 15
i := j*8
IF a[i+7:i] < b[i+7:i]
dst[i+7:i] := a[i+7:i]
ELSE
dst[i+7:i] := b[i+7:i]
FI
ENDFOR
Performance
vpminsb
__m256i _mm256_mask_min_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_min_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpminsb
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k[j]
IF a[i+7:i] < b[i+7:i]
dst[i+7:i] := a[i+7:i]
ELSE
dst[i+7:i] := b[i+7:i]
FI
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:256] := 0
vpminsb
__m256i _mm256_maskz_min_epi8 (__mmask32 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_min_epi8 (__mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpminsb
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k[j]
IF a[i+7:i] < b[i+7:i]
dst[i+7:i] := a[i+7:i]
ELSE
dst[i+7:i] := b[i+7:i]
FI
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpminsb
__m256i _mm256_min_epi8 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_min_epi8 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpminsb ymm, ymm, ymm
CPUID Flags: AVX2
Description
Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst.
Operation
FOR j := 0 to 31
i := j*8
IF a[i+7:i] < b[i+7:i]
dst[i+7:i] := a[i+7:i]
ELSE
dst[i+7:i] := b[i+7:i]
FI
ENDFOR
dst[MAX:256] := 0
Performance
vpminsb
__m512i _mm512_mask_min_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_min_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpminsb
CPUID Flags: AVX512BW
Description
Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k[j]
IF a[i+7:i] < b[i+7:i]
dst[i+7:i] := a[i+7:i]
ELSE
dst[i+7:i] := b[i+7:i]
FI
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:512] := 0
vpminsb
__m512i _mm512_maskz_min_epi8 (__mmask64 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_min_epi8 (__mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpminsb
CPUID Flags: AVX512BW
Description
Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k[j]
IF a[i+7:i] < b[i+7:i]
dst[i+7:i] := a[i+7:i]
ELSE
dst[i+7:i] := b[i+7:i]
FI
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpminsb
__m512i _mm512_min_epi8 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_min_epi8 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpminsb
CPUID Flags: AVX512BW
Description
Compare packed signed 8-bit integers in a and b, and store packed minimum values in dst.
Operation
FOR j := 0 to 63
i := j*8
IF a[i+7:i] < b[i+7:i]
dst[i+7:i] := a[i+7:i]
ELSE
dst[i+7:i] := b[i+7:i]
FI
ENDFOR
dst[MAX:512] := 0
vpminuw
__m128i _mm_mask_min_epu16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_min_epu16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpminuw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
IF a[i+15:i] < b[i+15:i]
dst[i+15:i] := a[i+15:i]
ELSE
dst[i+15:i] := b[i+15:i]
FI
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:128] := 0
vpminuw
__m128i _mm_maskz_min_epu16 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_min_epu16 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpminuw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
IF a[i+15:i] < b[i+15:i]
dst[i+15:i] := a[i+15:i]
ELSE
dst[i+15:i] := b[i+15:i]
FI
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
pminuw
__m128i _mm_min_epu16 (__m128i a, __m128i b)
Synopsis
__m128i _mm_min_epu16 (__m128i a, __m128i b)
#include "smmintrin.h"
Instruction: pminuw xmm, xmm
CPUID Flags: SSE4.1
Description
Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst.
Operation
FOR j := 0 to 7
i := j*16
IF a[i+15:i] < b[i+15:i]
dst[i+15:i] := a[i+15:i]
ELSE
dst[i+15:i] := b[i+15:i]
FI
ENDFOR
Performance
vpminuw
__m256i _mm256_mask_min_epu16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_min_epu16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpminuw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
IF a[i+15:i] < b[i+15:i]
dst[i+15:i] := a[i+15:i]
ELSE
dst[i+15:i] := b[i+15:i]
FI
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
vpminuw
__m256i _mm256_maskz_min_epu16 (__mmask16 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_min_epu16 (__mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpminuw
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
IF a[i+15:i] < b[i+15:i]
dst[i+15:i] := a[i+15:i]
ELSE
dst[i+15:i] := b[i+15:i]
FI
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpminuw
__m256i _mm256_min_epu16 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_min_epu16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpminuw ymm, ymm, ymm
CPUID Flags: AVX2
Description
Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst.
Operation
FOR j := 0 to 15
i := j*16
IF a[i+15:i] < b[i+15:i]
dst[i+15:i] := a[i+15:i]
ELSE
dst[i+15:i] := b[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
Performance
vpminuw
__m512i _mm512_mask_min_epu16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_min_epu16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpminuw
CPUID Flags: AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
IF a[i+15:i] < b[i+15:i]
dst[i+15:i] := a[i+15:i]
ELSE
dst[i+15:i] := b[i+15:i]
FI
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:512] := 0
vpminuw
__m512i _mm512_maskz_min_epu16 (__mmask32 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_min_epu16 (__mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpminuw
CPUID Flags: AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
IF a[i+15:i] < b[i+15:i]
dst[i+15:i] := a[i+15:i]
ELSE
dst[i+15:i] := b[i+15:i]
FI
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpminuw
__m512i _mm512_min_epu16 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_min_epu16 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpminuw
CPUID Flags: AVX512BW
Description
Compare packed unsigned 16-bit integers in a and b, and store packed minimum values in dst.
Operation
FOR j := 0 to 31
i := j*16
IF a[i+15:i] < b[i+15:i]
dst[i+15:i] := a[i+15:i]
ELSE
dst[i+15:i] := b[i+15:i]
FI
ENDFOR
dst[MAX:512] := 0
vpminud
__m128i _mm_mask_min_epu32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_min_epu32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpminud
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
IF a[i+31:i] < b[i+31:i]
dst[i+31:i] := a[i+31:i]
ELSE
dst[i+31:i] := b[i+31:i]
FI
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vpminud
__m128i _mm_maskz_min_epu32 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_min_epu32 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpminud
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
IF a[i+31:i] < b[i+31:i]
dst[i+31:i] := a[i+31:i]
ELSE
dst[i+31:i] := b[i+31:i]
FI
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
pminud
__m128i _mm_min_epu32 (__m128i a, __m128i b)
Synopsis
__m128i _mm_min_epu32 (__m128i a, __m128i b)
#include "smmintrin.h"
Instruction: pminud xmm, xmm
CPUID Flags: SSE4.1
Description
Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst.
Operation
FOR j := 0 to 3
i := j*32
IF a[i+31:i] < b[i+31:i]
dst[i+31:i] := a[i+31:i]
ELSE
dst[i+31:i] := b[i+31:i]
FI
ENDFOR
Performance
vpminud
__m256i _mm256_mask_min_epu32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_min_epu32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpminud
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
IF a[i+31:i] < b[i+31:i]
dst[i+31:i] := a[i+31:i]
ELSE
dst[i+31:i] := b[i+31:i]
FI
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vpminud
__m256i _mm256_maskz_min_epu32 (__mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_min_epu32 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpminud
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
IF a[i+31:i] < b[i+31:i]
dst[i+31:i] := a[i+31:i]
ELSE
dst[i+31:i] := b[i+31:i]
FI
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpminud
__m256i _mm256_min_epu32 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_min_epu32 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpminud ymm, ymm, ymm
CPUID Flags: AVX2
Description
Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst.
Operation
FOR j := 0 to 7
i := j*32
IF a[i+31:i] < b[i+31:i]
dst[i+31:i] := a[i+31:i]
ELSE
dst[i+31:i] := b[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
Performance
vpminud
__m512i _mm512_mask_min_epu32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_min_epu32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpminud zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
IF a[i+31:i] < b[i+31:i]
dst[i+31:i] := a[i+31:i]
ELSE
dst[i+31:i] := b[i+31:i]
FI
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpminud
__m512i _mm512_maskz_min_epu32 (__mmask16 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_min_epu32 (__mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpminud zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
IF a[i+31:i] < b[i+31:i]
dst[i+31:i] := a[i+31:i]
ELSE
dst[i+31:i] := b[i+31:i]
FI
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpminud
__m512i _mm512_min_epu32 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_min_epu32 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpminud zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compare packed unsigned 32-bit integers in a and b, and store packed minimum values in dst.
Operation
FOR j := 0 to 15
i := j*32
IF a[i+31:i] < b[i+31:i]
dst[i+31:i] := a[i+31:i]
ELSE
dst[i+31:i] := b[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpminuq
__m128i _mm_mask_min_epu64 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_min_epu64 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpminuq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
IF a[i+63:i] < b[i+63:i]
dst[i+63:i] := a[i+63:i]
ELSE
dst[i+63:i] := b[i+63:i]
FI
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpminuq
__m128i _mm_maskz_min_epu64 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_min_epu64 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpminuq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
IF a[i+63:i] < b[i+63:i]
dst[i+63:i] := a[i+63:i]
ELSE
dst[i+63:i] := b[i+63:i]
FI
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpminuq
__m128i _mm_min_epu64 (__m128i a, __m128i b)
Synopsis
__m128i _mm_min_epu64 (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpminuq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst.
Operation
FOR j := 0 to 1
i := j*64
IF a[i+63:i] < b[i+63:i]
dst[i+63:i] := a[i+63:i]
ELSE
dst[i+63:i] := b[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpminuq
__m256i _mm256_mask_min_epu64 (__m256i src, __mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_min_epu64 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpminuq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
IF a[i+63:i] < b[i+63:i]
dst[i+63:i] := a[i+63:i]
ELSE
dst[i+63:i] := b[i+63:i]
FI
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpminuq
__m256i _mm256_maskz_min_epu64 (__mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_min_epu64 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpminuq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
IF a[i+63:i] < b[i+63:i]
dst[i+63:i] := a[i+63:i]
ELSE
dst[i+63:i] := b[i+63:i]
FI
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpminuq
__m256i _mm256_min_epu64 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_min_epu64 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpminuq
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst.
Operation
FOR j := 0 to 3
i := j*64
IF a[i+63:i] < b[i+63:i]
dst[i+63:i] := a[i+63:i]
ELSE
dst[i+63:i] := b[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpminuq
__m512i _mm512_mask_min_epu64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_min_epu64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpminuq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
IF a[i+63:i] < b[i+63:i]
dst[i+63:i] := a[i+63:i]
ELSE
dst[i+63:i] := b[i+63:i]
FI
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpminuq
__m512i _mm512_maskz_min_epu64 (__mmask8 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_min_epu64 (__mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpminuq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
IF a[i+63:i] < b[i+63:i]
dst[i+63:i] := a[i+63:i]
ELSE
dst[i+63:i] := b[i+63:i]
FI
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpminuq
__m512i _mm512_min_epu64 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_min_epu64 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpminuq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Compare packed unsigned 64-bit integers in a and b, and store packed minimum values in dst.
Operation
FOR j := 0 to 7
i := j*64
IF a[i+63:i] < b[i+63:i]
dst[i+63:i] := a[i+63:i]
ELSE
dst[i+63:i] := b[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpminub
__m128i _mm_mask_min_epu8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_min_epu8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpminub
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k[j]
IF a[i+7:i] < b[i+7:i]
dst[i+7:i] := a[i+7:i]
ELSE
dst[i+7:i] := b[i+7:i]
FI
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:128] := 0
vpminub
__m128i _mm_maskz_min_epu8 (__mmask16 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_min_epu8 (__mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpminub
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k[j]
IF a[i+7:i] < b[i+7:i]
dst[i+7:i] := a[i+7:i]
ELSE
dst[i+7:i] := b[i+7:i]
FI
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
pminub
__m128i _mm_min_epu8 (__m128i a, __m128i b)
Synopsis
__m128i _mm_min_epu8 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: pminub xmm, xmm
CPUID Flags: SSE2
Description
Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst.
Operation
FOR j := 0 to 15
i := j*8
IF a[i+7:i] < b[i+7:i]
dst[i+7:i] := a[i+7:i]
ELSE
dst[i+7:i] := b[i+7:i]
FI
ENDFOR
Performance
vpminub
__m256i _mm256_mask_min_epu8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_min_epu8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpminub
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k[j]
IF a[i+7:i] < b[i+7:i]
dst[i+7:i] := a[i+7:i]
ELSE
dst[i+7:i] := b[i+7:i]
FI
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:256] := 0
vpminub
__m256i _mm256_maskz_min_epu8 (__mmask32 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_min_epu8 (__mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpminub
CPUID Flags: AVX512VL + AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k[j]
IF a[i+7:i] < b[i+7:i]
dst[i+7:i] := a[i+7:i]
ELSE
dst[i+7:i] := b[i+7:i]
FI
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpminub
__m256i _mm256_min_epu8 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_min_epu8 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpminub ymm, ymm, ymm
CPUID Flags: AVX2
Description
Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst.
Operation
FOR j := 0 to 31
i := j*8
IF a[i+7:i] < b[i+7:i]
dst[i+7:i] := a[i+7:i]
ELSE
dst[i+7:i] := b[i+7:i]
FI
ENDFOR
dst[MAX:256] := 0
Performance
vpminub
__m512i _mm512_mask_min_epu8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_min_epu8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpminub
CPUID Flags: AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k[j]
IF a[i+7:i] < b[i+7:i]
dst[i+7:i] := a[i+7:i]
ELSE
dst[i+7:i] := b[i+7:i]
FI
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:512] := 0
vpminub
__m512i _mm512_maskz_min_epu8 (__mmask64 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_min_epu8 (__mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpminub
CPUID Flags: AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k[j]
IF a[i+7:i] < b[i+7:i]
dst[i+7:i] := a[i+7:i]
ELSE
dst[i+7:i] := b[i+7:i]
FI
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpminub
__m512i _mm512_min_epu8 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_min_epu8 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpminub
CPUID Flags: AVX512BW
Description
Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst.
Operation
FOR j := 0 to 63
i := j*8
IF a[i+7:i] < b[i+7:i]
dst[i+7:i] := a[i+7:i]
ELSE
dst[i+7:i] := b[i+7:i]
FI
ENDFOR
dst[MAX:512] := 0
vminpd
__m128d _mm_mask_min_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_mask_min_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vminpd
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vminpd
__m128d _mm_maskz_min_pd (__mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_maskz_min_pd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vminpd
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
minpd
__m128d _mm_min_pd (__m128d a, __m128d b)
Synopsis
__m128d _mm_min_pd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: minpd xmm, xmm
CPUID Flags: SSE2
Description
Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
ENDFOR
Performance
vminpd
__m256d _mm256_mask_min_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)
Synopsis
__m256d _mm256_mask_min_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vminpd
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vminpd
__m256d _mm256_maskz_min_pd (__mmask8 k, __m256d a, __m256d b)
Synopsis
__m256d _mm256_maskz_min_pd (__mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vminpd
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vminpd
__m256d _mm256_min_pd (__m256d a, __m256d b)
Synopsis
__m256d _mm256_min_pd (__m256d a, __m256d b)
#include "immintrin.h"
Instruction: vminpd ymm, ymm, ymm
CPUID Flags: AVX
Description
Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:256] := 0
Performance
vminpd
__m512d _mm512_mask_min_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
Synopsis
__m512d _mm512_mask_min_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vminpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vminpd
__m512d _mm512_maskz_min_pd (__mmask8 k, __m512d a, __m512d b)
Synopsis
__m512d _mm512_maskz_min_pd (__mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vminpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vminpd
__m512d _mm512_min_pd (__m512d a, __m512d b)
Synopsis
__m512d _mm512_min_pd (__m512d a, __m512d b)
#include "immintrin.h"
Instruction: vminpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:512] := 0
pminsw
__m64 _mm_min_pi16 (__m64 a, __m64 b)
Synopsis
__m64 _mm_min_pi16 (__m64 a, __m64 b)
#include "xmmintrin.h"
Instruction: pminsw mm, mm
CPUID Flags: SSE
Description
Compare packed 16-bit integers in a and b, and store packed minimum values in dst.
Operation
FOR j := 0 to 3
i := j*16
IF a[i+15:i] < b[i+15:i]
dst[i+15:i] := a[i+15:i]
ELSE
dst[i+15:i] := b[i+15:i]
FI
ENDFOR
Performance
vminps
__m128 _mm_mask_min_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_mask_min_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vminps
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vminps
__m128 _mm_maskz_min_ps (__mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_maskz_min_ps (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vminps
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
minps
__m128 _mm_min_ps (__m128 a, __m128 b)
Synopsis
__m128 _mm_min_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: minps xmm, xmm
CPUID Flags: SSE
Description
Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
ENDFOR
Performance
vminps
__m256 _mm256_mask_min_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)
Synopsis
__m256 _mm256_mask_min_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vminps
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vminps
__m256 _mm256_maskz_min_ps (__mmask8 k, __m256 a, __m256 b)
Synopsis
__m256 _mm256_maskz_min_ps (__mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vminps
CPUID Flags: AVX512VL + AVX512F
Description
Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vminps
__m256 _mm256_min_ps (__m256 a, __m256 b)
Synopsis
__m256 _mm256_min_ps (__m256 a, __m256 b)
#include "immintrin.h"
Instruction: vminps ymm, ymm, ymm
CPUID Flags: AVX
Description
Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
ENDFOR
dst[MAX:256] := 0
Performance
vminps
__m512 _mm512_mask_min_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
Synopsis
__m512 _mm512_mask_min_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vminps zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vminps
__m512 _mm512_maskz_min_ps (__mmask16 k, __m512 a, __m512 b)
Synopsis
__m512 _mm512_maskz_min_ps (__mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vminps zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vminps
__m512 _mm512_min_ps (__m512 a, __m512 b)
Synopsis
__m512 _mm512_min_ps (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vminps zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
ENDFOR
dst[MAX:512] := 0
pminub
__m64 _mm_min_pu8 (__m64 a, __m64 b)
Synopsis
__m64 _mm_min_pu8 (__m64 a, __m64 b)
#include "xmmintrin.h"
Instruction: pminub mm, mm
CPUID Flags: SSE
Description
Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst.
Operation
FOR j := 0 to 7
i := j*8
IF a[i+7:i] < b[i+7:i]
dst[i+7:i] := a[i+7:i]
ELSE
dst[i+7:i] := b[i+7:i]
FI
ENDFOR
Performance
vminpd
__m512d _mm512_mask_min_round_pd (__m512d src, __mmask8 k, __m512d a, __m512d b, int sae)
Synopsis
__m512d _mm512_mask_min_round_pd (__m512d src, __mmask8 k, __m512d a, __m512d b, int sae)
#include "immintrin.h"
Instruction: vminpd zmm {k}, zmm, zmm {sae}
CPUID Flags: AVX512F
Description
Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Pass _MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vminpd
__m512d _mm512_maskz_min_round_pd (__mmask8 k, __m512d a, __m512d b, int sae)
Synopsis
__m512d _mm512_maskz_min_round_pd (__mmask8 k, __m512d a, __m512d b, int sae)
#include "immintrin.h"
Instruction: vminpd zmm {k}, zmm, zmm {sae}
CPUID Flags: AVX512F
Description
Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Pass _MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vminpd
__m512d _mm512_min_round_pd (__m512d a, __m512d b, int sae)
Synopsis
__m512d _mm512_min_round_pd (__m512d a, __m512d b, int sae)
#include "immintrin.h"
Instruction: vminpd zmm {k}, zmm, zmm {sae}
CPUID Flags: AVX512F
Description
Compare packed double-precision (64-bit) floating-point elements in a and b, and store packed minimum values in dst.
Pass _MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := MIN(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:512] := 0
vminps
__m512 _mm512_mask_min_round_ps (__m512 src, __mmask16 k, __m512 a, __m512 b, int sae)
Synopsis
__m512 _mm512_mask_min_round_ps (__m512 src, __mmask16 k, __m512 a, __m512 b, int sae)
#include "immintrin.h"
Instruction: vminps zmm {k}, zmm, zmm {sae}
CPUID Flags: AVX512F
Description
Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Pass _MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vminps
__m512 _mm512_maskz_min_round_ps (__mmask16 k, __m512 a, __m512 b, int sae)
Synopsis
__m512 _mm512_maskz_min_round_ps (__mmask16 k, __m512 a, __m512 b, int sae)
#include "immintrin.h"
Instruction: vminps zmm {k}, zmm, zmm {sae}
CPUID Flags: AVX512F
Description
Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Pass _MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vminps
__m512 _mm512_min_round_ps (__m512 a, __m512 b, int sae)
Synopsis
__m512 _mm512_min_round_ps (__m512 a, __m512 b, int sae)
#include "immintrin.h"
Instruction: vminps zmm {k}, zmm, zmm {sae}
CPUID Flags: AVX512F
Description
Compare packed single-precision (32-bit) floating-point elements in a and b, and store packed minimum values in dst.
Pass _MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
ENDFOR
dst[MAX:512] := 0
vminsd
__m128d _mm_mask_min_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int sae)
Synopsis
__m128d _mm_mask_min_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int sae)
#include "immintrin.h"
Instruction: vminsd xmm {k}, xmm, xmm {sae}
CPUID Flags: AVX512F
Description
Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Pass _MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
IF k[0]
dst[63:0] := MIN(a[63:0], b[63:0])
ELSE
dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vminsd
__m128d _mm_maskz_min_round_sd (__mmask8 k, __m128d a, __m128d b, int sae)
Synopsis
__m128d _mm_maskz_min_round_sd (__mmask8 k, __m128d a, __m128d b, int sae)
#include "immintrin.h"
Instruction: vminsd xmm {k}, xmm, xmm {sae}
CPUID Flags: AVX512F
Description
Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Pass _MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
IF k[0]
dst[63:0] := MIN(a[63:0], b[63:0])
ELSE
dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vminsd
__m128d _mm_min_round_sd (__m128d a, __m128d b, int sae)
Synopsis
__m128d _mm_min_round_sd (__m128d a, __m128d b, int sae)
#include "immintrin.h"
Instruction: vminsd xmm {k}, xmm, xmm {sae}
CPUID Flags: AVX512F
Description
Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst, and copy the upper element from a to the upper element of dst.
Pass _MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
dst[63:0] := MIN(a[63:0], b[63:0])
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vminss
__m128 _mm_mask_min_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int sae)
Synopsis
__m128 _mm_mask_min_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int sae)
#include "immintrin.h"
Instruction: vminss xmm {k}, xmm, xmm {sae}
CPUID Flags: AVX512F
Description
Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Pass _MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
IF k[0]
dst[31:0] := MIN(a[31:0], b[31:0])
ELSE
dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vminss
__m128 _mm_maskz_min_round_ss (__mmask8 k, __m128 a, __m128 b, int sae)
Synopsis
__m128 _mm_maskz_min_round_ss (__mmask8 k, __m128 a, __m128 b, int sae)
#include "immintrin.h"
Instruction: vminss xmm {k}, xmm, xmm {sae}
CPUID Flags: AVX512F
Description
Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Pass _MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
IF k[0]
dst[31:0] := MIN(a[31:0], b[31:0])
ELSE
dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vminss
__m128 _mm_min_round_ss (__m128 a, __m128 b, int sae)
Synopsis
__m128 _mm_min_round_ss (__m128 a, __m128 b, int sae)
#include "immintrin.h"
Instruction: vminss xmm {k}, xmm, xmm {sae}
CPUID Flags: AVX512F
Description
Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst, and copy the upper element from a to the upper element of dst.
Pass _MM_FROUND_NO_EXC to sae to suppress all exceptions.
Operation
dst[31:0] := MIN(a[31:0], b[31:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vminsd
__m128d _mm_mask_min_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_mask_min_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vminsd xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Operation
IF k[0]
dst[63:0] := MIN(a[63:0], b[63:0])
ELSE
dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vminsd
__m128d _mm_maskz_min_sd (__mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_maskz_min_sd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vminsd xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Operation
IF k[0]
dst[63:0] := MIN(a[63:0], b[63:0])
ELSE
dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
minsd
__m128d _mm_min_sd (__m128d a, __m128d b)
Synopsis
__m128d _mm_min_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: minsd xmm, xmm
CPUID Flags: SSE2
Description
Compare the lower double-precision (64-bit) floating-point elements in a and b, store the minimum value in the lower element of dst, and copy the upper element from a to the upper element of dst.
Operation
dst[63:0] := MIN(a[63:0], b[63:0])
dst[127:64] := a[127:64]
Performance
vminss
__m128 _mm_mask_min_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_mask_min_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vminss xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Operation
IF k[0]
dst[31:0] := MIN(a[31:0], b[31:0])
ELSE
dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vminss
__m128 _mm_maskz_min_ss (__mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_maskz_min_ss (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vminss xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Operation
IF k[0]
dst[31:0] := MIN(a[31:0], b[31:0])
ELSE
dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
minss
__m128 _mm_min_ss (__m128 a, __m128 b)
Synopsis
__m128 _mm_min_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: minss xmm, xmm
CPUID Flags: SSE
Description
Compare the lower single-precision (32-bit) floating-point elements in a and b, store the minimum value in the lower element of dst, and copy the upper element from a to the upper element of dst.
Operation
dst[31:0] := MIN(a[31:0], b[31:0])
dst[127:32] := a[127:32]
Performance
phminposuw
__m128i _mm_minpos_epu16 (__m128i a)
Synopsis
__m128i _mm_minpos_epu16 (__m128i a)
#include "smmintrin.h"
Instruction: phminposuw xmm, xmm
CPUID Flags: SSE4.1
Description
Horizontally compute the minimum amongst the packed unsigned 16-bit integers in a, store the minimum and index in dst, and zero the remaining bits in dst.
Operation
index[2:0] := 0
min[15:0] := a[15:0]
FOR j := 0 to 7
i := j*16
IF a[i+15:i] < min[15:0]
index[2:0] := j
min[15:0] := a[i+15:i]
FI
ENDFOR
dst[15:0] := min[15:0]
dst[18:16] := index[2:0]
dst[127:19] := 0
Performance
monitor
void _mm_monitor (void const* p, unsigned extensions, unsigned hints)
Synopsis
void _mm_monitor (void const* p, unsigned extensions, unsigned hints)
#include "pmmintrin.h"
Instruction: monitor
CPUID Flags: MONITOR
Description
Arm address monitoring hardware using the address specified in p. A store to an address within the specified address range triggers the monitoring hardware. Specify optional extensions in extensions, and optional hints in hints.
vmovdqu16
__m128i _mm_mask_mov_epi16 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_mov_epi16 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vmovdqu16
CPUID Flags: AVX512VL + AVX512BW
Description
Move packed 16-bit integers from a into dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := a[i+15:i]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:128] := 0
vmovdqu16
__m128i _mm_maskz_mov_epi16 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_mov_epi16 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vmovdqu16
CPUID Flags: AVX512VL + AVX512BW
Description
Move packed 16-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := a[i+15:i]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vmovdqu16
__m256i _mm256_mask_mov_epi16 (__m256i src, __mmask16 k, __m256i a)
Synopsis
__m256i _mm256_mask_mov_epi16 (__m256i src, __mmask16 k, __m256i a)
#include "immintrin.h"
Instruction: vmovdqu16
CPUID Flags: AVX512VL + AVX512BW
Description
Move packed 16-bit integers from a into dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := a[i+15:i]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
vmovdqu16
__m256i _mm256_maskz_mov_epi16 (__mmask16 k, __m256i a)
Synopsis
__m256i _mm256_maskz_mov_epi16 (__mmask16 k, __m256i a)
#include "immintrin.h"
Instruction: vmovdqu16
CPUID Flags: AVX512VL + AVX512BW
Description
Move packed 16-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := a[i+15:i]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vmovdqu16
__m512i _mm512_mask_mov_epi16 (__m512i src, __mmask32 k, __m512i a)
Synopsis
__m512i _mm512_mask_mov_epi16 (__m512i src, __mmask32 k, __m512i a)
#include "immintrin.h"
Instruction: vmovdqu16
CPUID Flags: AVX512BW
Description
Move packed 16-bit integers from a into dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := a[i+15:i]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:512] := 0
vmovdqu16
__m512i _mm512_maskz_mov_epi16 (__mmask32 k, __m512i a)
Synopsis
__m512i _mm512_maskz_mov_epi16 (__mmask32 k, __m512i a)
#include "immintrin.h"
Instruction: vmovdqu16
CPUID Flags: AVX512BW
Description
Move packed 16-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := a[i+15:i]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vmovdqa32
__m128i _mm_mask_mov_epi32 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_mov_epi32 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vmovdqa32
CPUID Flags: AVX512VL + AVX512F
Description
Move packed 32-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vmovdqa32
__m128i _mm_maskz_mov_epi32 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_mov_epi32 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vmovdqa32
CPUID Flags: AVX512VL + AVX512F
Description
Move packed 32-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vmovdqa32
__m256i _mm256_mask_mov_epi32 (__m256i src, __mmask8 k, __m256i a)
Synopsis
__m256i _mm256_mask_mov_epi32 (__m256i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vmovdqa32
CPUID Flags: AVX512VL + AVX512F
Description
Move packed 32-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vmovdqa32
__m256i _mm256_maskz_mov_epi32 (__mmask8 k, __m256i a)
Synopsis
__m256i _mm256_maskz_mov_epi32 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vmovdqa32
CPUID Flags: AVX512VL + AVX512F
Description
Move packed 32-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vmovdqa32
__m512i _mm512_mask_mov_epi32 (__m512i src, __mmask16 k, __m512i a)
Synopsis
__m512i _mm512_mask_mov_epi32 (__m512i src, __mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vmovdqa32 zmm {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Move packed 32-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vmovdqa32
__m512i _mm512_maskz_mov_epi32 (__mmask16 k, __m512i a)
Synopsis
__m512i _mm512_maskz_mov_epi32 (__mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vmovdqa32 zmm {k}, zmm
CPUID Flags: AVX512F
Description
Move packed 32-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vmovdqa64
__m128i _mm_mask_mov_epi64 (__m128i src, __mmask8 k, __m128i a)
Synopsis
__m128i _mm_mask_mov_epi64 (__m128i src, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vmovdqa64
CPUID Flags: AVX512VL + AVX512F
Description
Move packed 64-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vmovdqa64
__m128i _mm_maskz_mov_epi64 (__mmask8 k, __m128i a)
Synopsis
__m128i _mm_maskz_mov_epi64 (__mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vmovdqa64
CPUID Flags: AVX512VL + AVX512F
Description
Move packed 64-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vmovdqa64
__m256i _mm256_mask_mov_epi64 (__m256i src, __mmask8 k, __m256i a)
Synopsis
__m256i _mm256_mask_mov_epi64 (__m256i src, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vmovdqa64
CPUID Flags: AVX512VL + AVX512F
Description
Move packed 64-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vmovdqa64
__m256i _mm256_maskz_mov_epi64 (__mmask8 k, __m256i a)
Synopsis
__m256i _mm256_maskz_mov_epi64 (__mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vmovdqa64
CPUID Flags: AVX512VL + AVX512F
Description
Move packed 64-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vmovdqa64
__m512i _mm512_mask_mov_epi64 (__m512i src, __mmask8 k, __m512i a)
Synopsis
__m512i _mm512_mask_mov_epi64 (__m512i src, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vmovdqa64 zmm {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Move packed 64-bit integers from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vmovdqa64
__m512i _mm512_maskz_mov_epi64 (__mmask8 k, __m512i a)
Synopsis
__m512i _mm512_maskz_mov_epi64 (__mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vmovdqa64 zmm {k}, zmm
CPUID Flags: AVX512F
Description
Move packed 64-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vmovdqu8
__m128i _mm_mask_mov_epi8 (__m128i src, __mmask16 k, __m128i a)
Synopsis
__m128i _mm_mask_mov_epi8 (__m128i src, __mmask16 k, __m128i a)
#include "immintrin.h"
Instruction: vmovdqu8
CPUID Flags: AVX512VL + AVX512BW
Description
Move packed 8-bit integers from a into dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k[j]
dst[i+7:i] := a[i+7:i]
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:128] := 0
vmovdqu8
__m128i _mm_maskz_mov_epi8 (__mmask16 k, __m128i a)
Synopsis
__m128i _mm_maskz_mov_epi8 (__mmask16 k, __m128i a)
#include "immintrin.h"
Instruction: vmovdqu8
CPUID Flags: AVX512VL + AVX512BW
Description
Move packed 8-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k[j]
dst[i+7:i] := a[i+7:i]
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vmovdqu8
__m256i _mm256_mask_mov_epi8 (__m256i src, __mmask32 k, __m256i a)
Synopsis
__m256i _mm256_mask_mov_epi8 (__m256i src, __mmask32 k, __m256i a)
#include "immintrin.h"
Instruction: vmovdqu8
CPUID Flags: AVX512VL + AVX512BW
Description
Move packed 8-bit integers from a into dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k[j]
dst[i+7:i] := a[i+7:i]
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:256] := 0
vmovdqu8
__m256i _mm256_maskz_mov_epi8 (__mmask32 k, __m256i a)
Synopsis
__m256i _mm256_maskz_mov_epi8 (__mmask32 k, __m256i a)
#include "immintrin.h"
Instruction: vmovdqu8
CPUID Flags: AVX512VL + AVX512BW
Description
Move packed 8-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k[j]
dst[i+7:i] := a[i+7:i]
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vmovdqu8
__m512i _mm512_mask_mov_epi8 (__m512i src, __mmask64 k, __m512i a)
Synopsis
__m512i _mm512_mask_mov_epi8 (__m512i src, __mmask64 k, __m512i a)
#include "immintrin.h"
Instruction: vmovdqu8
CPUID Flags: AVX512BW
Description
Move packed 8-bit integers from a into dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k[j]
dst[i+7:i] := a[i+7:i]
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:512] := 0
vmovdqu8
__m512i _mm512_maskz_mov_epi8 (__mmask64 k, __m512i a)
Synopsis
__m512i _mm512_maskz_mov_epi8 (__mmask64 k, __m512i a)
#include "immintrin.h"
Instruction: vmovdqu8
CPUID Flags: AVX512BW
Description
Move packed 8-bit integers from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k[j]
dst[i+7:i] := a[i+7:i]
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vmovapd
__m128d _mm_mask_mov_pd (__m128d src, __mmask8 k, __m128d a)
Synopsis
__m128d _mm_mask_mov_pd (__m128d src, __mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vmovapd
CPUID Flags: AVX512VL + AVX512F
Description
Move packed double-precision (64-bit) floating-point elements from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vmovapd
__m128d _mm_maskz_mov_pd (__mmask8 k, __m128d a)
Synopsis
__m128d _mm_maskz_mov_pd (__mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vmovapd
CPUID Flags: AVX512VL + AVX512F
Description
Move packed double-precision (64-bit) floating-point elements from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vmovapd
__m256d _mm256_mask_mov_pd (__m256d src, __mmask8 k, __m256d a)
Synopsis
__m256d _mm256_mask_mov_pd (__m256d src, __mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vmovapd
CPUID Flags: AVX512VL + AVX512F
Description
Move packed double-precision (64-bit) floating-point elements from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vmovapd
__m256d _mm256_maskz_mov_pd (__mmask8 k, __m256d a)
Synopsis
__m256d _mm256_maskz_mov_pd (__mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vmovapd
CPUID Flags: AVX512VL + AVX512F
Description
Move packed double-precision (64-bit) floating-point elements from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vmovapd
__m512d _mm512_mask_mov_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_mov_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vmovapd zmm {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Move packed double-precision (64-bit) floating-point elements from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vmovapd
__m512d _mm512_maskz_mov_pd (__mmask8 k, __m512d a)
Synopsis
__m512d _mm512_maskz_mov_pd (__mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vmovapd zmm {k}, zmm
CPUID Flags: AVX512F
Description
Move packed double-precision (64-bit) floating-point elements from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vmovaps
__m128 _mm_mask_mov_ps (__m128 src, __mmask8 k, __m128 a)
Synopsis
__m128 _mm_mask_mov_ps (__m128 src, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vmovaps
CPUID Flags: AVX512VL + AVX512F
Description
Move packed single-precision (32-bit) floating-point elements from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vmovaps
__m128 _mm_maskz_mov_ps (__mmask8 k, __m128 a)
Synopsis
__m128 _mm_maskz_mov_ps (__mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vmovaps
CPUID Flags: AVX512VL + AVX512F
Description
Move packed single-precision (32-bit) floating-point elements from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vmovaps
__m256 _mm256_mask_mov_ps (__m256 src, __mmask8 k, __m256 a)
Synopsis
__m256 _mm256_mask_mov_ps (__m256 src, __mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vmovaps
CPUID Flags: AVX512VL + AVX512F
Description
Move packed single-precision (32-bit) floating-point elements from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vmovaps
__m256 _mm256_maskz_mov_ps (__mmask8 k, __m256 a)
Synopsis
__m256 _mm256_maskz_mov_ps (__mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vmovaps
CPUID Flags: AVX512VL + AVX512F
Description
Move packed single-precision (32-bit) floating-point elements from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vmovaps
__m512 _mm512_mask_mov_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_mov_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vmovaps zmm {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Move packed single-precision (32-bit) floating-point elements from a to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vmovaps
__m512 _mm512_maskz_mov_ps (__mmask16 k, __m512 a)
Synopsis
__m512 _mm512_maskz_mov_ps (__mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vmovaps zmm {k}, zmm
CPUID Flags: AVX512F
Description
Move packed single-precision (32-bit) floating-point elements from a into dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
movq
__m128i _mm_move_epi64 (__m128i a)
Synopsis
__m128i _mm_move_epi64 (__m128i a)
#include "emmintrin.h"
Instruction: movq xmm, xmm
CPUID Flags: SSE2
Description
Copy the lower 64-bit integer in a to the lower element of dst, and zero the upper element.
Operation
dst[63:0] := a[63:0]
dst[127:64] := 0
Performance
vmovsd
__m128d _mm_mask_move_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_mask_move_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vmovsd xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Move the lower double-precision (64-bit) floating-point element from b to the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Operation
IF k[0]
dst[63:0] := b[63:0]
ELSE
dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vmovsd
__m128d _mm_maskz_move_sd (__mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_maskz_move_sd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vmovsd xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Move the lower double-precision (64-bit) floating-point element from b to the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Operation
IF k[0]
dst[63:0] := b[63:0]
ELSE
dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
movsd
__m128d _mm_move_sd (__m128d a, __m128d b)
Synopsis
__m128d _mm_move_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: movsd xmm, xmm
CPUID Flags: SSE2
Description
Move the lower double-precision (64-bit) floating-point element from b to the lower element of dst, and copy the upper element from a to the upper element of dst.
Operation
dst[63:0] := b[63:0]
dst[127:64] := a[127:64]
Performance
vmovss
__m128 _mm_mask_move_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_mask_move_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vmovss xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Move the lower single-precision (32-bit) floating-point element from b to the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
IF k[0]
dst[31:0] := b[31:0]
ELSE
dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vmovss
__m128 _mm_maskz_move_ss (__mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_maskz_move_ss (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vmovss xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Move the lower single-precision (32-bit) floating-point element from b to the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
IF k[0]
dst[31:0] := b[31:0]
ELSE
dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
movss
__m128 _mm_move_ss (__m128 a, __m128 b)
Synopsis
__m128 _mm_move_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: movss xmm, xmm
CPUID Flags: SSE
Description
Move the lower single-precision (32-bit) floating-point element from b to the lower element of dst, and copy the upper 3 elements from a to the upper elements of dst.
Operation
dst[31:0] := b[31:0]
dst[63:32] := a[63:32]
dst[95:64] := a[95:64]
dst[127:96] := a[127:96]
Performance
vmovddup
__m128d _mm_mask_movedup_pd (__m128d src, __mmask8 k, __m128d a)
Synopsis
__m128d _mm_mask_movedup_pd (__m128d src, __mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vmovddup
CPUID Flags: AVX512VL + AVX512F
Description
Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
tmp[63:0] := a[63:0]
tmp[127:64] := a[63:0]
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := tmp[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vmovddup
__m128d _mm_maskz_movedup_pd (__mmask8 k, __m128d a)
Synopsis
__m128d _mm_maskz_movedup_pd (__mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vmovddup
CPUID Flags: AVX512VL + AVX512F
Description
Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
tmp[63:0] := a[63:0]
tmp[127:64] := a[63:0]
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := tmp[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
movddup
__m128d _mm_movedup_pd (__m128d a)
Synopsis
__m128d _mm_movedup_pd (__m128d a)
#include "pmmintrin.h"
Instruction: movddup xmm, xmm
CPUID Flags: SSE3
Description
Duplicate the low double-precision (64-bit) floating-point element from a, and store the results in dst.
Operation
dst[63:0] := a[63:0]
dst[127:64] := a[63:0]
Performance
vmovddup
__m256d _mm256_mask_movedup_pd (__m256d src, __mmask8 k, __m256d a)
Synopsis
__m256d _mm256_mask_movedup_pd (__m256d src, __mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vmovddup
CPUID Flags: AVX512VL + AVX512F
Description
Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
tmp[63:0] := a[63:0]
tmp[127:64] := a[63:0]
tmp[191:128] := a[191:128]
tmp[255:192] := a[191:128]
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := tmp[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vmovddup
__m256d _mm256_maskz_movedup_pd (__mmask8 k, __m256d a)
Synopsis
__m256d _mm256_maskz_movedup_pd (__mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vmovddup
CPUID Flags: AVX512VL + AVX512F
Description
Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
tmp[63:0] := a[63:0]
tmp[127:64] := a[63:0]
tmp[191:128] := a[191:128]
tmp[255:192] := a[191:128]
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := tmp[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vmovddup
__m256d _mm256_movedup_pd (__m256d a)
Synopsis
__m256d _mm256_movedup_pd (__m256d a)
#include "immintrin.h"
Instruction: vmovddup ymm, ymm
CPUID Flags: AVX
Description
Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst.
Operation
dst[63:0] := a[63:0]
dst[127:64] := a[63:0]
dst[191:128] := a[191:128]
dst[255:192] := a[191:128]
dst[MAX:256] := 0
Performance
vmovddup
__m512d _mm512_mask_movedup_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_movedup_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vmovddup zmm {k}, zmm
CPUID Flags: AVX512F
Description
Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
tmp[63:0] := a[63:0]
tmp[127:64] := a[63:0]
tmp[191:128] := a[191:128]
tmp[255:192] := a[191:128]
tmp[319:256] := a[319:256]
tmp[383:320] := a[319:256]
tmp[447:384] := a[447:384]
tmp[511:448] := a[447:384]
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := tmp[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vmovddup
__m512d _mm512_maskz_movedup_pd (__mmask8 k, __m512d a)
Synopsis
__m512d _mm512_maskz_movedup_pd (__mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vmovddup zmm {k}, zmm
CPUID Flags: AVX512F
Description
Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
tmp[63:0] := a[63:0]
tmp[127:64] := a[63:0]
tmp[191:128] := a[191:128]
tmp[255:192] := a[191:128]
tmp[319:256] := a[319:256]
tmp[383:320] := a[319:256]
tmp[447:384] := a[447:384]
tmp[511:448] := a[447:384]
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := tmp[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vmovddup
__m512d _mm512_movedup_pd (__m512d a)
Synopsis
__m512d _mm512_movedup_pd (__m512d a)
#include "immintrin.h"
Instruction: vmovddup zmm {k}, zmm
CPUID Flags: AVX512F
Description
Duplicate even-indexed double-precision (64-bit) floating-point elements from a, and store the results in dst.
Operation
dst[63:0] := a[63:0]
dst[127:64] := a[63:0]
dst[191:128] := a[191:128]
dst[255:192] := a[191:128]
dst[319:256] := a[319:256]
dst[383:320] := a[319:256]
dst[447:384] := a[447:384]
dst[511:448] := a[447:384]
dst[MAX:512] := 0
vmovshdup
__m128 _mm_mask_movehdup_ps (__m128 src, __mmask8 k, __m128 a)
Synopsis
__m128 _mm_mask_movehdup_ps (__m128 src, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vmovshdup
CPUID Flags: AVX512VL + AVX512F
Description
Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
tmp[31:0] := a[63:32]
tmp[63:32] := a[63:32]
tmp[95:64] := a[127:96]
tmp[127:96] := a[127:96]
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := tmp[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vmovshdup
__m128 _mm_maskz_movehdup_ps (__mmask8 k, __m128 a)
Synopsis
__m128 _mm_maskz_movehdup_ps (__mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vmovshdup
CPUID Flags: AVX512VL + AVX512F
Description
Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
tmp[31:0] := a[63:32]
tmp[63:32] := a[63:32]
tmp[95:64] := a[127:96]
tmp[127:96] := a[127:96]
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := tmp[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
movshdup
__m128 _mm_movehdup_ps (__m128 a)
Synopsis
__m128 _mm_movehdup_ps (__m128 a)
#include "pmmintrin.h"
Instruction: movshdup xmm, xmm
CPUID Flags: SSE3
Description
Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst.
Operation
dst[31:0] := a[63:32]
dst[63:32] := a[63:32]
dst[95:64] := a[127:96]
dst[127:96] := a[127:96]
Performance
vmovshdup
__m256 _mm256_mask_movehdup_ps (__m256 src, __mmask8 k, __m256 a)
Synopsis
__m256 _mm256_mask_movehdup_ps (__m256 src, __mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vmovshdup
CPUID Flags: AVX512VL + AVX512F
Description
Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
tmp[31:0] := a[63:32]
tmp[63:32] := a[63:32]
tmp[95:64] := a[127:96]
tmp[127:96] := a[127:96]
tmp[159:128] := a[191:160]
tmp[191:160] := a[191:160]
tmp[223:192] := a[255:224]
tmp[255:224] := a[255:224]
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := tmp[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vmovshdup
__m256 _mm256_maskz_movehdup_ps (__mmask8 k, __m256 a)
Synopsis
__m256 _mm256_maskz_movehdup_ps (__mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vmovshdup
CPUID Flags: AVX512VL + AVX512F
Description
Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
tmp[31:0] := a[63:32]
tmp[63:32] := a[63:32]
tmp[95:64] := a[127:96]
tmp[127:96] := a[127:96]
tmp[159:128] := a[191:160]
tmp[191:160] := a[191:160]
tmp[223:192] := a[255:224]
tmp[255:224] := a[255:224]
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := tmp[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vmovshdup
__m256 _mm256_movehdup_ps (__m256 a)
Synopsis
__m256 _mm256_movehdup_ps (__m256 a)
#include "immintrin.h"
Instruction: vmovshdup ymm, ymm
CPUID Flags: AVX
Description
Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst.
Operation
dst[31:0] := a[63:32]
dst[63:32] := a[63:32]
dst[95:64] := a[127:96]
dst[127:96] := a[127:96]
dst[159:128] := a[191:160]
dst[191:160] := a[191:160]
dst[223:192] := a[255:224]
dst[255:224] := a[255:224]
dst[MAX:256] := 0
Performance
vmovshdup
__m512 _mm512_mask_movehdup_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_movehdup_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vmovshdup zmm {k}, zmm
CPUID Flags: AVX512F
Description
Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
tmp[31:0] := a[63:32]
tmp[63:32] := a[63:32]
tmp[95:64] := a[127:96]
tmp[127:96] := a[127:96]
tmp[159:128] := a[191:160]
tmp[191:160] := a[191:160]
tmp[223:192] := a[255:224]
tmp[255:224] := a[255:224]
tmp[287:256] := a[319:288]
tmp[319:288] := a[319:288]
tmp[351:320] := a[383:352]
tmp[383:352] := a[383:352]
tmp[415:384] := a[447:416]
tmp[447:416] := a[447:416]
tmp[479:448] := a[511:480]
tmp[511:480] := a[511:480]
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := tmp[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vmovshdup
__m512 _mm512_maskz_movehdup_ps (__mmask16 k, __m512 a)
Synopsis
__m512 _mm512_maskz_movehdup_ps (__mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vmovshdup zmm {k}, zmm
CPUID Flags: AVX512F
Description
Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
tmp[31:0] := a[63:32]
tmp[63:32] := a[63:32]
tmp[95:64] := a[127:96]
tmp[127:96] := a[127:96]
tmp[159:128] := a[191:160]
tmp[191:160] := a[191:160]
tmp[223:192] := a[255:224]
tmp[255:224] := a[255:224]
tmp[287:256] := a[319:288]
tmp[319:288] := a[319:288]
tmp[351:320] := a[383:352]
tmp[383:352] := a[383:352]
tmp[415:384] := a[447:416]
tmp[447:416] := a[447:416]
tmp[479:448] := a[511:480]
tmp[511:480] := a[511:480]
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := tmp[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vmovshdup
__m512 _mm512_movehdup_ps (__m512 a)
Synopsis
__m512 _mm512_movehdup_ps (__m512 a)
#include "immintrin.h"
Instruction: vmovshdup zmm {k}, zmm
CPUID Flags: AVX512F
Description
Duplicate odd-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst.
Operation
dst[31:0] := a[63:32]
dst[63:32] := a[63:32]
dst[95:64] := a[127:96]
dst[127:96] := a[127:96]
dst[159:128] := a[191:160]
dst[191:160] := a[191:160]
dst[223:192] := a[255:224]
dst[255:224] := a[255:224]
dst[287:256] := a[319:288]
dst[319:288] := a[319:288]
dst[351:320] := a[383:352]
dst[383:352] := a[383:352]
dst[415:384] := a[447:416]
dst[447:416] := a[447:416]
dst[479:448] := a[511:480]
dst[511:480] := a[511:480]
dst[MAX:512] := 0
movhlps
__m128 _mm_movehl_ps (__m128 a, __m128 b)
Synopsis
__m128 _mm_movehl_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: movhlps xmm, xmm
CPUID Flags: SSE
Description
Move the upper 2 single-precision (32-bit) floating-point elements from b to the lower 2 elements of dst, and copy the upper 2 elements from a to the upper 2 elements of dst.
Operation
dst[31:0] := b[95:64]
dst[63:32] := b[127:96]
dst[95:64] := a[95:64]
dst[127:96] := a[127:96]
Performance
vmovsldup
__m128 _mm_mask_moveldup_ps (__m128 src, __mmask8 k, __m128 a)
Synopsis
__m128 _mm_mask_moveldup_ps (__m128 src, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vmovsldup
CPUID Flags: AVX512VL + AVX512F
Description
Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
tmp[31:0] := a[31:0]
tmp[63:32] := a[31:0]
tmp[95:64] := a[95:64]
tmp[127:96] := a[95:64]
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := tmp[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vmovsldup
__m128 _mm_maskz_moveldup_ps (__mmask8 k, __m128 a)
Synopsis
__m128 _mm_maskz_moveldup_ps (__mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vmovsldup
CPUID Flags: AVX512VL + AVX512F
Description
Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
tmp[31:0] := a[31:0]
tmp[63:32] := a[31:0]
tmp[95:64] := a[95:64]
tmp[127:96] := a[95:64]
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := tmp[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
movsldup
__m128 _mm_moveldup_ps (__m128 a)
Synopsis
__m128 _mm_moveldup_ps (__m128 a)
#include "pmmintrin.h"
Instruction: movsldup xmm, xmm
CPUID Flags: SSE3
Description
Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst.
Operation
dst[31:0] := a[31:0]
dst[63:32] := a[31:0]
dst[95:64] := a[95:64]
dst[127:96] := a[95:64]
Performance
vmovsldup
__m256 _mm256_mask_moveldup_ps (__m256 src, __mmask8 k, __m256 a)
Synopsis
__m256 _mm256_mask_moveldup_ps (__m256 src, __mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vmovsldup
CPUID Flags: AVX512VL + AVX512F
Description
Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
tmp[31:0] := a[31:0]
tmp[63:32] := a[31:0]
tmp[95:64] := a[95:64]
tmp[127:96] := a[95:64]
tmp[159:128] := a[159:128]
tmp[191:160] := a[159:128]
tmp[223:192] := a[223:192]
tmp[255:224] := a[223:192]
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := tmp[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vmovsldup
__m256 _mm256_maskz_moveldup_ps (__mmask8 k, __m256 a)
Synopsis
__m256 _mm256_maskz_moveldup_ps (__mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vmovsldup
CPUID Flags: AVX512VL + AVX512F
Description
Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
tmp[31:0] := a[31:0]
tmp[63:32] := a[31:0]
tmp[95:64] := a[95:64]
tmp[127:96] := a[95:64]
tmp[159:128] := a[159:128]
tmp[191:160] := a[159:128]
tmp[223:192] := a[223:192]
tmp[255:224] := a[223:192]
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := tmp[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vmovsldup
__m256 _mm256_moveldup_ps (__m256 a)
Synopsis
__m256 _mm256_moveldup_ps (__m256 a)
#include "immintrin.h"
Instruction: vmovsldup ymm, ymm
CPUID Flags: AVX
Description
Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst.
Operation
dst[31:0] := a[31:0]
dst[63:32] := a[31:0]
dst[95:64] := a[95:64]
dst[127:96] := a[95:64]
dst[159:128] := a[159:128]
dst[191:160] := a[159:128]
dst[223:192] := a[223:192]
dst[255:224] := a[223:192]
dst[MAX:256] := 0
Performance
vmovsldup
__m512 _mm512_mask_moveldup_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_moveldup_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vmovsldup zmm {k}, zmm
CPUID Flags: AVX512F
Description
Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
tmp[31:0] := a[31:0]
tmp[63:32] := a[31:0]
tmp[95:64] := a[95:64]
tmp[127:96] := a[95:64]
tmp[159:128] := a[159:128]
tmp[191:160] := a[159:128]
tmp[223:192] := a[223:192]
tmp[255:224] := a[223:192]
tmp[287:256] := a[287:256]
tmp[319:288] := a[287:256]
tmp[351:320] := a[351:320]
tmp[383:352] := a[351:320]
tmp[415:384] := a[415:384]
tmp[447:416] := a[415:384]
tmp[479:448] := a[479:448]
tmp[511:480] := a[479:448]
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := tmp[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vmovsldup
__m512 _mm512_maskz_moveldup_ps (__mmask16 k, __m512 a)
Synopsis
__m512 _mm512_maskz_moveldup_ps (__mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vmovsldup zmm {k}, zmm
CPUID Flags: AVX512F
Description
Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
tmp[31:0] := a[31:0]
tmp[63:32] := a[31:0]
tmp[95:64] := a[95:64]
tmp[127:96] := a[95:64]
tmp[159:128] := a[159:128]
tmp[191:160] := a[159:128]
tmp[223:192] := a[223:192]
tmp[255:224] := a[223:192]
tmp[287:256] := a[287:256]
tmp[319:288] := a[287:256]
tmp[351:320] := a[351:320]
tmp[383:352] := a[351:320]
tmp[415:384] := a[415:384]
tmp[447:416] := a[415:384]
tmp[479:448] := a[479:448]
tmp[511:480] := a[479:448]
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := tmp[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vmovsldup
__m512 _mm512_moveldup_ps (__m512 a)
Synopsis
__m512 _mm512_moveldup_ps (__m512 a)
#include "immintrin.h"
Instruction: vmovsldup zmm {k}, zmm
CPUID Flags: AVX512F
Description
Duplicate even-indexed single-precision (32-bit) floating-point elements from a, and store the results in dst.
Operation
dst[31:0] := a[31:0]
dst[63:32] := a[31:0]
dst[95:64] := a[95:64]
dst[127:96] := a[95:64]
dst[159:128] := a[159:128]
dst[191:160] := a[159:128]
dst[223:192] := a[223:192]
dst[255:224] := a[223:192]
dst[287:256] := a[287:256]
dst[319:288] := a[287:256]
dst[351:320] := a[351:320]
dst[383:352] := a[351:320]
dst[415:384] := a[415:384]
dst[447:416] := a[415:384]
dst[479:448] := a[479:448]
dst[511:480] := a[479:448]
dst[MAX:512] := 0
movlhps
__m128 _mm_movelh_ps (__m128 a, __m128 b)
Synopsis
__m128 _mm_movelh_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: movlhps xmm, xmm
CPUID Flags: SSE
Description
Move the lower 2 single-precision (32-bit) floating-point elements from b to the upper 2 elements of dst, and copy the lower 2 elements from a to the lower 2 elements of dst.
Operation
dst[31:0] := a[31:0]
dst[63:32] := a[63:32]
dst[95:64] := b[31:0]
dst[127:96] := b[63:32]
Performance
pmovmskb
int _mm_movemask_epi8 (__m128i a)
Synopsis
int _mm_movemask_epi8 (__m128i a)
#include "emmintrin.h"
Instruction: pmovmskb r32, xmm
CPUID Flags: SSE2
Description
Create mask from the most significant bit of each 8-bit element in a, and store the result in dst.
Operation
FOR j := 0 to 15
i := j*8
dst[j] := a[i+7]
ENDFOR
dst[MAX:16] := 0
Performance
vpmovmskb
int _mm256_movemask_epi8 (__m256i a)
Synopsis
int _mm256_movemask_epi8 (__m256i a)
#include "immintrin.h"
Instruction: vpmovmskb r32, ymm
CPUID Flags: AVX2
Description
Create mask from the most significant bit of each 8-bit element in a, and store the result in dst.
Operation
FOR j := 0 to 31
i := j*8
dst[j] := a[i+7]
ENDFOR
Performance
movmskpd
int _mm_movemask_pd (__m128d a)
Synopsis
int _mm_movemask_pd (__m128d a)
#include "emmintrin.h"
Instruction: movmskpd r32, xmm
CPUID Flags: SSE2
Description
Set each bit of mask dst based on the most significant bit of the corresponding packed double-precision (64-bit) floating-point element in a.
Operation
FOR j := 0 to 1
i := j*64
IF a[i+63]
dst[j] := 1
ELSE
dst[j] := 0
FI
ENDFOR
dst[MAX:2] := 0
vmovmskpd
int _mm256_movemask_pd (__m256d a)
Synopsis
int _mm256_movemask_pd (__m256d a)
#include "immintrin.h"
Instruction: vmovmskpd r32, ymm
CPUID Flags: AVX
Description
Set each bit of mask dst based on the most significant bit of the corresponding packed double-precision (64-bit) floating-point element in a.
Operation
FOR j := 0 to 3
i := j*64
IF a[i+63]
dst[j] := 1
ELSE
dst[j] := 0
FI
ENDFOR
dst[MAX:4] := 0
Performance
pmovmskb
int _mm_movemask_pi8 (__m64 a)
Synopsis
int _mm_movemask_pi8 (__m64 a)
#include "xmmintrin.h"
Instruction: pmovmskb r32, mm
CPUID Flags: SSE
Description
Create mask from the most significant bit of each 8-bit element in a, and store the result in dst.
Operation
FOR j := 0 to 7
i := j*8
dst[j] := a[i+7]
ENDFOR
dst[MAX:8] := 0
movmskps
int _mm_movemask_ps (__m128 a)
Synopsis
int _mm_movemask_ps (__m128 a)
#include "xmmintrin.h"
Instruction: movmskps r32, xmm
CPUID Flags: SSE
Description
Set each bit of mask dst based on the most significant bit of the corresponding packed single-precision (32-bit) floating-point element in a.
Operation
FOR j := 0 to 3
i := j*32
IF a[i+31]
dst[j] := 1
ELSE
dst[j] := 0
FI
ENDFOR
dst[MAX:4] := 0
vmovmskps
int _mm256_movemask_ps (__m256 a)
Synopsis
int _mm256_movemask_ps (__m256 a)
#include "immintrin.h"
Instruction: vmovmskps r32, ymm
CPUID Flags: AVX
Description
Set each bit of mask dst based on the most significant bit of the corresponding packed single-precision (32-bit) floating-point element in a.
Operation
FOR j := 0 to 7
i := j*32
IF a[i+31]
dst[j] := 1
ELSE
dst[j] := 0
FI
ENDFOR
dst[MAX:8] := 0
Performance
vpmovw2m
__mmask8 _mm_movepi16_mask (__m128i a)
Synopsis
__mmask8 _mm_movepi16_mask (__m128i a)
#include "immintrin.h"
Instruction: vpmovw2m
CPUID Flags: AVX512VL + AVX512BW
Description
Set each bit of mask register k based on the most significant bit of the corresponding packed 16-bit integer in a.
Operation
FOR j := 0 to 7
i := j*16
IF a[i+15]
k[j] := 1
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vpmovw2m
__mmask16 _mm256_movepi16_mask (__m256i a)
Synopsis
__mmask16 _mm256_movepi16_mask (__m256i a)
#include "immintrin.h"
Instruction: vpmovw2m
CPUID Flags: AVX512VL + AVX512BW
Description
Set each bit of mask register k based on the most significant bit of the corresponding packed 16-bit integer in a.
Operation
FOR j := 0 to 15
i := j*16
IF a[i+15]
k[j] := 1
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vpmovw2m
__mmask32 _mm512_movepi16_mask (__m512i a)
Synopsis
__mmask32 _mm512_movepi16_mask (__m512i a)
#include "immintrin.h"
Instruction: vpmovw2m
CPUID Flags: AVX512BW
Description
Set each bit of mask register k based on the most significant bit of the corresponding packed 16-bit integer in a.
Operation
FOR j := 0 to 31
i := j*16
IF a[i+15]
k[j] := 1
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:32] := 0
vpmovd2m
__mmask8 _mm_movepi32_mask (__m128i a)
Synopsis
__mmask8 _mm_movepi32_mask (__m128i a)
#include "immintrin.h"
Instruction: vpmovd2m
CPUID Flags: AVX512VL + AVX512DQ
Description
Set each bit of mask register k based on the most significant bit of the corresponding packed 32-bit integer in a.
Operation
FOR j := 0 to 3
i := j*32
IF a[i+31]
k[j] := 1
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:4] := 0
vpmovd2m
__mmask8 _mm256_movepi32_mask (__m256i a)
Synopsis
__mmask8 _mm256_movepi32_mask (__m256i a)
#include "immintrin.h"
Instruction: vpmovd2m
CPUID Flags: AVX512VL + AVX512DQ
Description
Set each bit of mask register k based on the most significant bit of the corresponding packed 32-bit integer in a.
Operation
FOR j := 0 to 7
i := j*32
IF a[i+31]
k[j] := 1
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vpmovd2m
__mmask16 _mm512_movepi32_mask (__m512i a)
Synopsis
__mmask16 _mm512_movepi32_mask (__m512i a)
#include "immintrin.h"
Instruction: vpmovd2m
CPUID Flags: AVX512DQ
Description
Set each bit of mask register k based on the most significant bit of the corresponding packed 32-bit integer in a.
Operation
FOR j := 0 to 15
i := j*32
IF a[i+31]
k[j] := 1
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vpmovq2m
__mmask8 _mm_movepi64_mask (__m128i a)
Synopsis
__mmask8 _mm_movepi64_mask (__m128i a)
#include "immintrin.h"
Instruction: vpmovq2m
CPUID Flags: AVX512VL + AVX512DQ
Description
Set each bit of mask register k based on the most significant bit of the corresponding packed 64-bit integer in a.
Operation
FOR j := 0 to 1
i := j*64
IF a[i+63]
k[j] := 1
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:2] := 0
vpmovq2m
__mmask8 _mm256_movepi64_mask (__m256i a)
Synopsis
__mmask8 _mm256_movepi64_mask (__m256i a)
#include "immintrin.h"
Instruction: vpmovq2m
CPUID Flags: AVX512VL + AVX512DQ
Description
Set each bit of mask register k based on the most significant bit of the corresponding packed 64-bit integer in a.
Operation
FOR j := 0 to 3
i := j*64
IF a[i+63]
k[j] := 1
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:4] := 0
vpmovq2m
__mmask8 _mm512_movepi64_mask (__m512i a)
Synopsis
__mmask8 _mm512_movepi64_mask (__m512i a)
#include "immintrin.h"
Instruction: vpmovq2m
CPUID Flags: AVX512DQ
Description
Set each bit of mask register k based on the most significant bit of the corresponding packed 64-bit integer in a.
Operation
FOR j := 0 to 7
i := j*64
IF a[i+63]
k[j] := 1
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
movdq2q
__m64 _mm_movepi64_pi64 (__m128i a)
Synopsis
__m64 _mm_movepi64_pi64 (__m128i a)
#include "emmintrin.h"
Instruction: movdq2q mm, xmm
CPUID Flags: SSE2
Description
Copy the lower 64-bit integer in a to dst.
Operation
dst[63:0] := a[63:0]
vpmovb2m
__mmask16 _mm_movepi8_mask (__m128i a)
Synopsis
__mmask16 _mm_movepi8_mask (__m128i a)
#include "immintrin.h"
Instruction: vpmovb2m
CPUID Flags: AVX512VL + AVX512BW
Description
Set each bit of mask register k based on the most significant bit of the corresponding packed 8-bit integer in a.
Operation
FOR j := 0 to 15
i := j*8
IF a[i+7]
k[j] := 1
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vpmovb2m
__mmask32 _mm256_movepi8_mask (__m256i a)
Synopsis
__mmask32 _mm256_movepi8_mask (__m256i a)
#include "immintrin.h"
Instruction: vpmovb2m
CPUID Flags: AVX512VL + AVX512BW
Description
Set each bit of mask register k based on the most significant bit of the corresponding packed 8-bit integer in a.
Operation
FOR j := 0 to 31
i := j*8
IF a[i+7]
k[j] := 1
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:32] := 0
vpmovb2m
__mmask64 _mm512_movepi8_mask (__m512i a)
Synopsis
__mmask64 _mm512_movepi8_mask (__m512i a)
#include "immintrin.h"
Instruction: vpmovb2m
CPUID Flags: AVX512BW
Description
Set each bit of mask register k based on the most significant bit of the corresponding packed 8-bit integer in a.
Operation
FOR j := 0 to 63
i := j*8
IF a[i+7]
k[j] := 1
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:64] := 0
vpmovm2w
__m128i _mm_movm_epi16 (__mmask8 k)
Synopsis
__m128i _mm_movm_epi16 (__mmask8 k)
#include "immintrin.h"
Instruction: vpmovm2w
CPUID Flags: AVX512VL + AVX512BW
Description
Set each packed 16-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k.
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := 0xFFFF
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpmovm2w
__m256i _mm256_movm_epi16 (__mmask16 k)
Synopsis
__m256i _mm256_movm_epi16 (__mmask16 k)
#include "immintrin.h"
Instruction: vpmovm2w
CPUID Flags: AVX512VL + AVX512BW
Description
Set each packed 16-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k.
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := 0xFFFF
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpmovm2w
__m512i _mm512_movm_epi16 (__mmask32 k)
Synopsis
__m512i _mm512_movm_epi16 (__mmask32 k)
#include "immintrin.h"
Instruction: vpmovm2w
CPUID Flags: AVX512BW
Description
Set each packed 16-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k.
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := 0xFFFF
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpmovm2d
__m128i _mm_movm_epi32 (__mmask8 k)
Synopsis
__m128i _mm_movm_epi32 (__mmask8 k)
#include "immintrin.h"
Instruction: vpmovm2d
CPUID Flags: AVX512VL + AVX512DQ
Description
Set each packed 32-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k.
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := 0xFFFFFFFF
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpmovm2d
__m256i _mm256_movm_epi32 (__mmask8 k)
Synopsis
__m256i _mm256_movm_epi32 (__mmask8 k)
#include "immintrin.h"
Instruction: vpmovm2d
CPUID Flags: AVX512VL + AVX512DQ
Description
Set each packed 32-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k.
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := 0xFFFFFFFF
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpmovm2d
__m512i _mm512_movm_epi32 (__mmask16 k)
Synopsis
__m512i _mm512_movm_epi32 (__mmask16 k)
#include "immintrin.h"
Instruction: vpmovm2d
CPUID Flags: AVX512DQ
Description
Set each packed 32-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k.
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := 0xFFFFFFFF
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpmovm2q
__m128i _mm_movm_epi64 (__mmask8 k)
Synopsis
__m128i _mm_movm_epi64 (__mmask8 k)
#include "immintrin.h"
Instruction: vpmovm2q
CPUID Flags: AVX512VL + AVX512DQ
Description
Set each packed 64-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k.
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := 0xFFFFFFFFFFFFFFFF
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpmovm2q
__m256i _mm256_movm_epi64 (__mmask8 k)
Synopsis
__m256i _mm256_movm_epi64 (__mmask8 k)
#include "immintrin.h"
Instruction: vpmovm2q
CPUID Flags: AVX512VL + AVX512DQ
Description
Set each packed 64-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k.
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := 0xFFFFFFFFFFFFFFFF
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpmovm2q
__m512i _mm512_movm_epi64 (__mmask8 k)
Synopsis
__m512i _mm512_movm_epi64 (__mmask8 k)
#include "immintrin.h"
Instruction: vpmovm2q
CPUID Flags: AVX512DQ
Description
Set each packed 64-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k.
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := 0xFFFFFFFFFFFFFFFF
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpmovm2b
__m128i _mm_movm_epi8 (__mmask16 k)
Synopsis
__m128i _mm_movm_epi8 (__mmask16 k)
#include "immintrin.h"
Instruction: vpmovm2b
CPUID Flags: AVX512BW + AVX512VL
Description
Set each packed 8-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k.
Operation
FOR j := 0 to 15
i := j*8
IF k[j]
dst[i+7:i] := 0xFF
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpmovm2b
__m256i _mm256_movm_epi8 (__mmask32 k)
Synopsis
__m256i _mm256_movm_epi8 (__mmask32 k)
#include "immintrin.h"
Instruction: vpmovm2b
CPUID Flags: AVX512VL + AVX512BW
Description
Set each packed 8-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k.
Operation
FOR j := 0 to 31
i := j*8
IF k[j]
dst[i+7:i] := 0xFF
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpmovm2b
__m512i _mm512_movm_epi8 (__mmask64 k)
Synopsis
__m512i _mm512_movm_epi8 (__mmask64 k)
#include "immintrin.h"
Instruction: vpmovm2b
CPUID Flags: AVX512BW
Description
Set each packed 8-bit integer in dst to all ones or all zeros based on the value of the corresponding bit in k.
Operation
FOR j := 0 to 63
i := j*8
IF k[j]
dst[i+7:i] := 0xFF
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
movq2dq
__m128i _mm_movpi64_epi64 (__m64 a)
Synopsis
__m128i _mm_movpi64_epi64 (__m64 a)
#include "emmintrin.h"
Instruction: movq2dq xmm, mm
CPUID Flags: SSE2
Description
Copy the 64-bit integer a to the lower element of dst, and zero the upper element.
Operation
dst[63:0] := a[63:0]
dst[127:64] := 0
mpsadbw
__m128i _mm_mpsadbw_epu8 (__m128i a, __m128i b, const int imm8)
Synopsis
__m128i _mm_mpsadbw_epu8 (__m128i a, __m128i b, const int imm8)
#include "smmintrin.h"
Instruction: mpsadbw xmm, xmm, imm
CPUID Flags: SSE4.1
Description
Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst.
Eight SADs are performed using one quadruplet from b and eight quadruplets from a. One quadruplet is selected from b starting at the offset specified in imm8. Eight quadruplets are formed from sequential 8-bit integers selected from a starting at the offset specified in imm8.
Operation
MPSADBW(a[127:0], b[127:0], imm8[2:0]) {
a_offset := imm8[2]*32
b_offset := imm8[1:0]*32
FOR j := 0 to 7
i := j*8
k := a_offset+i
l := b_offset
tmp[i+15:i] := ABS(a[k+7:k] - b[l+7:l]) + ABS(a[k+15:k+8] - b[l+15:l+8]) + ABS(a[k+23:k+16] - b[l+23:l+16]) + ABS(a[k+31:k+24] - b[l+31:l+24])
ENDFOR
RETURN tmp[127:0]
}
dst[127:0] := MPSADBW(a[127:0], b[127:0], imm8[2:0])
Performance
vmpsadbw
__m256i _mm256_mpsadbw_epu8 (__m256i a, __m256i b, const int imm8)
Synopsis
__m256i _mm256_mpsadbw_epu8 (__m256i a, __m256i b, const int imm8)
#include "immintrin.h"
Instruction: vmpsadbw ymm, ymm, ymm, imm
CPUID Flags: AVX2
Description
Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in a compared to those in b, and store the 16-bit results in dst.
Eight SADs are performed for each 128-bit lane using one quadruplet from b and eight quadruplets from a. One quadruplet is selected from b starting at the offset specified in imm8. Eight quadruplets are formed from sequential 8-bit integers selected from a starting at the offset specified in imm8.
Operation
MPSADBW(a[127:0], b[127:0], imm8[2:0]) {
a_offset := imm8[2]*32
b_offset := imm8[1:0]*32
FOR j := 0 to 7
i := j*8
k := a_offset+i
l := b_offset
tmp[i+15:i] := ABS(a[k+7:k] - b[l+7:l]) + ABS(a[k+15:k+8] - b[l+15:l+8]) + ABS(a[k+23:k+16] - b[l+23:l+16]) + ABS(a[k+31:k+24] - b[l+31:l+24])
ENDFOR
RETURN tmp[127:0]
}
dst[127:0] := MPSADBW(a[127:0], b[127:0], imm8[2:0])
dst[255:128] := MPSADBW(a[255:128], b[255:128], imm8[5:3])
dst[MAX:256] := 0
Performance
vpmuldq
__m128i _mm_mask_mul_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_mul_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmuldq
CPUID Flags: AVX512VL + AVX512F
Description
Multiply the low 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := a[i+31:i] * b[i+31:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpmuldq
__m128i _mm_maskz_mul_epi32 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_mul_epi32 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmuldq
CPUID Flags: AVX512VL + AVX512F
Description
Multiply the low 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := a[i+31:i] * b[i+31:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
pmuldq
__m128i _mm_mul_epi32 (__m128i a, __m128i b)
Synopsis
__m128i _mm_mul_epi32 (__m128i a, __m128i b)
#include "smmintrin.h"
Instruction: pmuldq xmm, xmm
CPUID Flags: SSE4.1
Description
Multiply the low 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := a[i+31:i] * b[i+31:i]
ENDFOR
Performance
vpmuldq
__m256i _mm256_mask_mul_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_mul_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmuldq
CPUID Flags: AVX512VL + AVX512F
Description
Multiply the low 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := a[i+31:i] * b[i+31:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpmuldq
__m256i _mm256_maskz_mul_epi32 (__mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_mul_epi32 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmuldq
CPUID Flags: AVX512VL + AVX512F
Description
Multiply the low 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := a[i+31:i] * b[i+31:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpmuldq
__m256i _mm256_mul_epi32 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_mul_epi32 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmuldq ymm, ymm, ymm
CPUID Flags: AVX2
Description
Multiply the low 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := a[i+31:i] * b[i+31:i]
ENDFOR
dst[MAX:256] := 0
Performance
vpmuldq
__m512i _mm512_mask_mul_epi32 (__m512i src, __mmask8 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_mul_epi32 (__m512i src, __mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmuldq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Multiply the low 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[i+31:i] * b[i+31:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpmuldq
__m512i _mm512_maskz_mul_epi32 (__mmask8 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_mul_epi32 (__mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmuldq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Multiply the low 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[i+31:i] * b[i+31:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpmuldq
__m512i _mm512_mul_epi32 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_mul_epi32 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmuldq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Multiply the low 32-bit integers from each packed 64-bit element in a and b, and store the signed 64-bit results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := a[i+31:i] * b[i+31:i]
ENDFOR
dst[MAX:512] := 0
vpmuludq
__m128i _mm_mask_mul_epu32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_mul_epu32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmuludq
CPUID Flags: AVX512VL + AVX512F
Description
Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := a[i+31:i] * b[i+31:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpmuludq
__m128i _mm_maskz_mul_epu32 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_mul_epu32 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmuludq
CPUID Flags: AVX512VL + AVX512F
Description
Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := a[i+31:i] * b[i+31:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
pmuludq
__m128i _mm_mul_epu32 (__m128i a, __m128i b)
Synopsis
__m128i _mm_mul_epu32 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: pmuludq xmm, xmm
CPUID Flags: SSE2
Description
Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := a[i+31:i] * b[i+31:i]
ENDFOR
Performance
vpmuludq
__m256i _mm256_mask_mul_epu32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_mul_epu32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmuludq
CPUID Flags: AVX512VL + AVX512F
Description
Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := a[i+31:i] * b[i+31:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpmuludq
__m256i _mm256_maskz_mul_epu32 (__mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_mul_epu32 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmuludq
CPUID Flags: AVX512VL + AVX512F
Description
Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := a[i+31:i] * b[i+31:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpmuludq
__m256i _mm256_mul_epu32 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_mul_epu32 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmuludq ymm, ymm, ymm
CPUID Flags: AVX2
Description
Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := a[i+31:i] * b[i+31:i]
ENDFOR
dst[MAX:256] := 0
Performance
vpmuludq
__m512i _mm512_mask_mul_epu32 (__m512i src, __mmask8 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_mul_epu32 (__m512i src, __mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmuludq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[i+31:i] * b[i+31:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpmuludq
__m512i _mm512_maskz_mul_epu32 (__mmask8 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_mul_epu32 (__mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmuludq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[i+31:i] * b[i+31:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpmuludq
__m512i _mm512_mul_epu32 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_mul_epu32 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmuludq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Multiply the low unsigned 32-bit integers from each packed 64-bit element in a and b, and store the unsigned 64-bit results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := a[i+31:i] * b[i+31:i]
ENDFOR
dst[MAX:512] := 0
vmulpd
__m128d _mm_mask_mul_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_mask_mul_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vmulpd
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] * b[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vmulpd
__m128d _mm_maskz_mul_pd (__mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_maskz_mul_pd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vmulpd
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] * b[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
mulpd
__m128d _mm_mul_pd (__m128d a, __m128d b)
Synopsis
__m128d _mm_mul_pd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: mulpd xmm, xmm
CPUID Flags: SSE2
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := a[i+63:i] * b[i+63:i]
ENDFOR
Performance
vmulpd
__m256d _mm256_mask_mul_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)
Synopsis
__m256d _mm256_mask_mul_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vmulpd
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] * b[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vmulpd
__m256d _mm256_maskz_mul_pd (__mmask8 k, __m256d a, __m256d b)
Synopsis
__m256d _mm256_maskz_mul_pd (__mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vmulpd
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] * b[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vmulpd
__m256d _mm256_mul_pd (__m256d a, __m256d b)
Synopsis
__m256d _mm256_mul_pd (__m256d a, __m256d b)
#include "immintrin.h"
Instruction: vmulpd ymm, ymm, ymm
CPUID Flags: AVX
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := a[i+63:i] * b[i+63:i]
ENDFOR
dst[MAX:256] := 0
Performance
vmulpd
__m512d _mm512_mask_mul_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
Synopsis
__m512d _mm512_mask_mul_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vmulpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] * b[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vmulpd
__m512d _mm512_maskz_mul_pd (__mmask8 k, __m512d a, __m512d b)
Synopsis
__m512d _mm512_maskz_mul_pd (__mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vmulpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] * b[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vmulpd
__m512d _mm512_mul_pd (__m512d a, __m512d b)
Synopsis
__m512d _mm512_mul_pd (__m512d a, __m512d b)
#include "immintrin.h"
Instruction: vmulpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := a[i+63:i] * b[i+63:i]
ENDFOR
dst[MAX:512] := 0
vmulps
__m128 _mm_mask_mul_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_mask_mul_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vmulps
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] * b[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vmulps
__m128 _mm_maskz_mul_ps (__mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_maskz_mul_ps (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vmulps
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] * b[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
mulps
__m128 _mm_mul_ps (__m128 a, __m128 b)
Synopsis
__m128 _mm_mul_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: mulps xmm, xmm
CPUID Flags: SSE
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := a[i+31:i] * b[i+31:i]
ENDFOR
Performance
vmulps
__m256 _mm256_mask_mul_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)
Synopsis
__m256 _mm256_mask_mul_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vmulps
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] * b[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vmulps
__m256 _mm256_maskz_mul_ps (__mmask8 k, __m256 a, __m256 b)
Synopsis
__m256 _mm256_maskz_mul_ps (__mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vmulps
CPUID Flags: AVX512VL + AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] * b[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vmulps
__m256 _mm256_mul_ps (__m256 a, __m256 b)
Synopsis
__m256 _mm256_mul_ps (__m256 a, __m256 b)
#include "immintrin.h"
Instruction: vmulps ymm, ymm, ymm
CPUID Flags: AVX
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := a[i+31:i] * b[i+31:i]
ENDFOR
dst[MAX:256] := 0
Performance
vmulps
__m512 _mm512_mask_mul_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
Synopsis
__m512 _mm512_mask_mul_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vmulps zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] * b[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vmulps
__m512 _mm512_maskz_mul_ps (__mmask16 k, __m512 a, __m512 b)
Synopsis
__m512 _mm512_maskz_mul_ps (__mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vmulps zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] * b[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vmulps
__m512 _mm512_mul_ps (__m512 a, __m512 b)
Synopsis
__m512 _mm512_mul_ps (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vmulps zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := a[i+31:i] * b[i+31:i]
ENDFOR
dst[MAX:512] := 0
vmulpd
__m512d _mm512_mask_mul_round_pd (__m512d src, __mmask8 k, __m512d a, __m512d b, int rounding)
Synopsis
__m512d _mm512_mask_mul_round_pd (__m512d src, __mmask8 k, __m512d a, __m512d b, int rounding)
#include "immintrin.h"
Instruction: vmulpd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed double-precision (64-bit) floating-point elements in
a and
b, and store the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] * b[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vmulpd
__m512d _mm512_maskz_mul_round_pd (__mmask8 k, __m512d a, __m512d b, int rounding)
Synopsis
__m512d _mm512_maskz_mul_round_pd (__mmask8 k, __m512d a, __m512d b, int rounding)
#include "immintrin.h"
Instruction: vmulpd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F
Description
Multiply packed double-precision (64-bit) floating-point elements in
a and
b, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] * b[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vmulpd
__m512d _mm512_mul_round_pd (__m512d a, __m512d b, int rounding)
Synopsis
__m512d _mm512_mul_round_pd (__m512d a, __m512d b, int rounding)
#include "immintrin.h"
Instruction: vmulpd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed double-precision (64-bit) floating-point elements in
a and
b, and store the results in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := a[i+63:i] * b[i+63:i]
ENDFOR
dst[MAX:512] := 0
vmulps
__m512 _mm512_mask_mul_round_ps (__m512 src, __mmask16 k, __m512 a, __m512 b, int rounding)
Synopsis
__m512 _mm512_mask_mul_round_ps (__m512 src, __mmask16 k, __m512 a, __m512 b, int rounding)
#include "immintrin.h"
Instruction: vmulps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed single-precision (32-bit) floating-point elements in
a and
b, and store the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] * b[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vmulps
__m512 _mm512_maskz_mul_round_ps (__mmask16 k, __m512 a, __m512 b, int rounding)
Synopsis
__m512 _mm512_maskz_mul_round_ps (__mmask16 k, __m512 a, __m512 b, int rounding)
#include "immintrin.h"
Instruction: vmulps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F
Description
Multiply packed single-precision (32-bit) floating-point elements in
a and
b, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] * b[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vmulps
__m512 _mm512_mul_round_ps (__m512 a, __m512 b, int rounding)
Synopsis
__m512 _mm512_mul_round_ps (__m512 a, __m512 b, int rounding)
#include "immintrin.h"
Instruction: vmulps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply packed single-precision (32-bit) floating-point elements in
a and
b, and store the results in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := a[i+31:i] * b[i+31:i]
ENDFOR
dst[MAX:512] := 0
vmulsd
__m128d _mm_mask_mul_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int rounding)
Synopsis
__m128d _mm_mask_mul_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vmulsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Multiply the lower double-precision (64-bit) floating-point element in
a and
b, store the result in the lower element of
dst using writemask
k (the element is copied from
src when mask bit 0 is not set), and copy the upper element from
a to the upper element of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[63:0] := a[63:0] * b[63:0]
ELSE
dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vmulsd
__m128d _mm_maskz_mul_round_sd (__mmask8 k, __m128d a, __m128d b, int rounding)
Synopsis
__m128d _mm_maskz_mul_round_sd (__mmask8 k, __m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vmulsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Multiply the lower double-precision (64-bit) floating-point element in
a and
b, store the result in the lower element of
dst using zeromask
k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from
a to the upper element of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[63:0] := a[63:0] * b[63:0]
ELSE
dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vmulsd
__m128d _mm_mul_round_sd (__m128d a, __m128d b, int rounding)
Synopsis
__m128d _mm_mul_round_sd (__m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vmulsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Multiply the lower double-precision (64-bit) floating-point element in
a and
b, store the result in the lower element of
dst, and copy the upper element from
a to the upper element of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[63:0] := a[63:0] * b[63:0]
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vmulss
__m128 _mm_mask_mul_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int rounding)
Synopsis
__m128 _mm_mask_mul_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vmulss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Multiply the lower single-precision (32-bit) floating-point element in
a and
b, store the result in the lower element of
dst using writemask
k (the element is copied from
src when mask bit 0 is not set), and copy the upper 3 packed elements from
a to the upper elements of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[31:0] := a[31:0] * b[31:0]
ELSE
dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vmulss
__m128 _mm_maskz_mul_round_ss (__mmask8 k, __m128 a, __m128 b, int rounding)
Synopsis
__m128 _mm_maskz_mul_round_ss (__mmask8 k, __m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vmulss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Multiply the lower single-precision (32-bit) floating-point element in
a and
b, store the result in the lower element of
dst using zeromask
k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from
a to the upper elements of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[31:0] := a[31:0] * b[31:0]
ELSE
dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vmulss
__m128 _mm_mul_round_ss (__m128 a, __m128 b, int rounding)
Synopsis
__m128 _mm_mul_round_ss (__m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vmulss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Multiply the lower single-precision (32-bit) floating-point element in
a and
b, store the result in the lower element of
dst, and copy the upper 3 packed elements from
a to the upper elements of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[31:0] := a[31:0] * b[31:0]
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vmulsd
__m128d _mm_mask_mul_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_mask_mul_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vmulsd xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Operation
IF k[0]
dst[63:0] := a[63:0] * b[63:0]
ELSE
dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vmulsd
__m128d _mm_maskz_mul_sd (__mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_maskz_mul_sd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vmulsd xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Operation
IF k[0]
dst[63:0] := a[63:0] * b[63:0]
ELSE
dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
mulsd
__m128d _mm_mul_sd (__m128d a, __m128d b)
Synopsis
__m128d _mm_mul_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: mulsd xmm, xmm
CPUID Flags: SSE2
Description
Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
Operation
dst[63:0] := a[63:0] * b[63:0]
dst[127:64] := a[127:64]
Performance
vmulss
__m128 _mm_mask_mul_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_mask_mul_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vmulss xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
IF k[0]
dst[31:0] := a[31:0] * b[31:0]
ELSE
dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vmulss
__m128 _mm_maskz_mul_ss (__mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_maskz_mul_ss (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vmulss xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
IF k[0]
dst[31:0] := a[31:0] * b[31:0]
ELSE
dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
mulss
__m128 _mm_mul_ss (__m128 a, __m128 b)
Synopsis
__m128 _mm_mul_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: mulss xmm, xmm
CPUID Flags: SSE
Description
Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
dst[31:0] := a[31:0] * b[31:0]
dst[127:32] := a[127:32]
Performance
pmuludq
__m64 _mm_mul_su32 (__m64 a, __m64 b)
Synopsis
__m64 _mm_mul_su32 (__m64 a, __m64 b)
#include "emmintrin.h"
Instruction: pmuludq mm, mm
CPUID Flags: SSE2
Description
Multiply the low unsigned 32-bit integers from a and b, and store the unsigned 64-bit result in dst.
Operation
dst[63:0] := a[31:0] * b[31:0]
Performance
vpmulhw
__m128i _mm_mask_mulhi_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_mulhi_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmulhw
CPUID Flags: AVX512VL + AVX512BW
Description
Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
tmp[31:0] := a[i+15:i] * b[i+15:i]
dst[i+15:i] := tmp[31:16]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:128] := 0
vpmulhw
__m128i _mm_maskz_mulhi_epi16 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_mulhi_epi16 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmulhw
CPUID Flags: AVX512VL + AVX512BW
Description
Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
tmp[31:0] := a[i+15:i] * b[i+15:i]
dst[i+15:i] := tmp[31:16]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
pmulhw
__m128i _mm_mulhi_epi16 (__m128i a, __m128i b)
Synopsis
__m128i _mm_mulhi_epi16 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: pmulhw xmm, xmm
CPUID Flags: SSE2
Description
Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst.
Operation
FOR j := 0 to 7
i := j*16
tmp[31:0] := a[i+15:i] * b[i+15:i]
dst[i+15:i] := tmp[31:16]
ENDFOR
Performance
vpmulhw
__m256i _mm256_mask_mulhi_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_mulhi_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmulhw
CPUID Flags: AVX512VL + AVX512BW
Description
Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
tmp[31:0] := a[i+15:i] * b[i+15:i]
dst[i+15:i] := tmp[31:16]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
vpmulhw
__m256i _mm256_maskz_mulhi_epi16 (__mmask16 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_mulhi_epi16 (__mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmulhw
CPUID Flags: AVX512VL + AVX512BW
Description
Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
tmp[31:0] := a[i+15:i] * b[i+15:i]
dst[i+15:i] := tmp[31:16]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpmulhw
__m256i _mm256_mulhi_epi16 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_mulhi_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmulhw ymm, ymm, ymm
CPUID Flags: AVX2
Description
Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst.
Operation
FOR j := 0 to 15
i := j*16
tmp[31:0] := a[i+15:i] * b[i+15:i]
dst[i+15:i] := tmp[31:16]
ENDFOR
dst[MAX:256] := 0
Performance
vpmulhw
__m512i _mm512_mask_mulhi_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_mulhi_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmulhw
CPUID Flags: AVX512BW
Description
Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
tmp[31:0] := a[i+15:i] * b[i+15:i]
dst[i+15:i] := tmp[31:16]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:512] := 0
vpmulhw
__m512i _mm512_maskz_mulhi_epi16 (__mmask32 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_mulhi_epi16 (__mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmulhw
CPUID Flags: AVX512BW
Description
Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
tmp[31:0] := a[i+15:i] * b[i+15:i]
dst[i+15:i] := tmp[31:16]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpmulhw
__m512i _mm512_mulhi_epi16 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_mulhi_epi16 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmulhw
CPUID Flags: AVX512BW
Description
Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst.
Operation
FOR j := 0 to 31
i := j*16
tmp[31:0] := a[i+15:i] * b[i+15:i]
dst[i+15:i] := tmp[31:16]
ENDFOR
dst[MAX:512] := 0
vpmulhd
__m512i _mm512_mask_mulhi_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_mulhi_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmulhd zmm {k}, zmm, m512
CPUID Flags: KNCNI
Description
Performs element-by-element multiplication between packed 32-bit integer elements in a and b and stores the high 32 bits of each result into dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) >> 32
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpmulhd
__m512i _mm512_mulhi_epi32 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_mulhi_epi32 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmulhd zmm {k}, zmm, m512
CPUID Flags: KNCNI
Description
Performs element-by-element multiplication between packed 32-bit integer elements in a and b and stores the high 32 bits of each result into dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) >> 32
ENDFOR
dst[MAX:512] := 0
vpmulhuw
__m128i _mm_mask_mulhi_epu16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_mulhi_epu16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmulhuw
CPUID Flags: AVX512VL + AVX512BW
Description
Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
tmp[31:0] := a[i+15:i] * b[i+15:i]
dst[i+15:i] := tmp[31:16]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:128] := 0
vpmulhuw
__m128i _mm_maskz_mulhi_epu16 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_mulhi_epu16 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmulhuw
CPUID Flags: AVX512VL + AVX512BW
Description
Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
tmp[31:0] := a[i+15:i] * b[i+15:i]
dst[i+15:i] := tmp[31:16]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
pmulhuw
__m128i _mm_mulhi_epu16 (__m128i a, __m128i b)
Synopsis
__m128i _mm_mulhi_epu16 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: pmulhuw xmm, xmm
CPUID Flags: SSE2
Description
Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst.
Operation
FOR j := 0 to 7
i := j*16
tmp[31:0] := a[i+15:i] * b[i+15:i]
dst[i+15:i] := tmp[31:16]
ENDFOR
Performance
vpmulhuw
__m256i _mm256_mask_mulhi_epu16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_mulhi_epu16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmulhuw
CPUID Flags: AVX512VL + AVX512BW
Description
Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
tmp[31:0] := a[i+15:i] * b[i+15:i]
dst[i+15:i] := tmp[31:16]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
vpmulhuw
__m256i _mm256_maskz_mulhi_epu16 (__mmask16 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_mulhi_epu16 (__mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmulhuw
CPUID Flags: AVX512VL + AVX512BW
Description
Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
tmp[31:0] := a[i+15:i] * b[i+15:i]
dst[i+15:i] := tmp[31:16]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpmulhuw
__m256i _mm256_mulhi_epu16 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_mulhi_epu16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmulhuw ymm, ymm, ymm
CPUID Flags: AVX2
Description
Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst.
Operation
FOR j := 0 to 15
i := j*16
tmp[31:0] := a[i+15:i] * b[i+15:i]
dst[i+15:i] := tmp[31:16]
ENDFOR
dst[MAX:256] := 0
Performance
vpmulhuw
__m512i _mm512_mask_mulhi_epu16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_mulhi_epu16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmulhuw
CPUID Flags: AVX512BW
Description
Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
tmp[31:0] := a[i+15:i] * b[i+15:i]
dst[i+15:i] := tmp[31:16]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:512] := 0
vpmulhuw
__m512i _mm512_maskz_mulhi_epu16 (__mmask32 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_mulhi_epu16 (__mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmulhuw
CPUID Flags: AVX512BW
Description
Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
tmp[31:0] := a[i+15:i] * b[i+15:i]
dst[i+15:i] := tmp[31:16]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpmulhuw
__m512i _mm512_mulhi_epu16 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_mulhi_epu16 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmulhuw
CPUID Flags: AVX512BW
Description
Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst.
Operation
FOR j := 0 to 31
i := j*16
tmp[31:0] := a[i+15:i] * b[i+15:i]
dst[i+15:i] := tmp[31:16]
ENDFOR
dst[MAX:512] := 0
vpmulhud
__m512i _mm512_mask_mulhi_epu32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_mulhi_epu32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmulhud zmm {k}, zmm, m512
CPUID Flags: KNCNI
Description
Performs element-by-element multiplication between packed unsigned 32-bit integer elements in a and b and stores the high 32 bits of each result into dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) >> 32
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpmulhud
__m512i _mm512_mulhi_epu32 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_mulhi_epu32 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmulhud zmm {k}, zmm, m512
CPUID Flags: KNCNI
Description
Performs element-by-element multiplication between packed unsigned 32-bit integer elements in a and b and stores the high 32 bits of each result into dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := (a[i+31:i] * b[i+31:i]) >> 32
ENDFOR
dst[MAX:512] := 0
pmulhuw
__m64 _mm_mulhi_pu16 (__m64 a, __m64 b)
Synopsis
__m64 _mm_mulhi_pu16 (__m64 a, __m64 b)
#include "xmmintrin.h"
Instruction: pmulhuw mm, mm
CPUID Flags: SSE
Description
Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst.
Operation
FOR j := 0 to 3
i := j*16
tmp[31:0] := a[i+15:i] * b[i+15:i]
dst[i+15:i] := tmp[31:16]
ENDFOR
vpmulhrsw
__m128i _mm_mask_mulhrs_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_mulhrs_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmulhrsw
CPUID Flags: AVX512VL + AVX512BW
Description
Multiply packed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
tmp[31:0] := ((a[i+15:i] * b[i+15:i]) >> 14) + 1
dst[i+15:i] := tmp[16:1]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:128] := 0
vpmulhrsw
__m128i _mm_maskz_mulhrs_epi16 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_mulhrs_epi16 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmulhrsw
CPUID Flags: AVX512VL + AVX512BW
Description
Multiply packed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
tmp[31:0] := ((a[i+15:i] * b[i+15:i]) >> 14) + 1
dst[i+15:i] := tmp[16:1]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
pmulhrsw
__m128i _mm_mulhrs_epi16 (__m128i a, __m128i b)
Synopsis
__m128i _mm_mulhrs_epi16 (__m128i a, __m128i b)
#include "tmmintrin.h"
Instruction: pmulhrsw xmm, xmm
CPUID Flags: SSSE3
Description
Multiply packed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to dst.
Operation
FOR j := 0 to 7
i := j*16
tmp[31:0] := ((a[i+15:i] * b[i+15:i]) >> 14) + 1
dst[i+15:i] := tmp[16:1]
ENDFOR
Performance
vpmulhrsw
__m256i _mm256_mask_mulhrs_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_mulhrs_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmulhrsw
CPUID Flags: AVX512VL + AVX512BW
Description
Multiply packed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
tmp[31:0] := ((a[i+15:i] * b[i+15:i]) >> 14) + 1
dst[i+15:i] := tmp[16:1]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
vpmulhrsw
__m256i _mm256_maskz_mulhrs_epi16 (__mmask16 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_mulhrs_epi16 (__mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmulhrsw
CPUID Flags: AVX512VL + AVX512BW
Description
Multiply packed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
tmp[31:0] := ((a[i+15:i] * b[i+15:i]) >> 14) + 1
dst[i+15:i] := tmp[16:1]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpmulhrsw
__m256i _mm256_mulhrs_epi16 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_mulhrs_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmulhrsw ymm, ymm, ymm
CPUID Flags: AVX2
Description
Multiply packed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to dst.
Operation
FOR j := 0 to 15
i := j*16
tmp[31:0] := ((a[i+15:i] * b[i+15:i]) >> 14) + 1
dst[i+15:i] := tmp[16:1]
ENDFOR
dst[MAX:256] := 0
Performance
vpmulhrsw
__m512i _mm512_mask_mulhrs_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_mulhrs_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmulhrsw
CPUID Flags: AVX512BW
Description
Multiply packed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
tmp[31:0] := ((a[i+15:i] * b[i+15:i]) >> 14) + 1
dst[i+15:i] := tmp[16:1]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:512] := 0
vpmulhrsw
__m512i _mm512_maskz_mulhrs_epi16 (__mmask32 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_mulhrs_epi16 (__mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmulhrsw
CPUID Flags: AVX512BW
Description
Multiply packed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
tmp[31:0] := ((a[i+15:i] * b[i+15:i]) >> 14) + 1
dst[i+15:i] := tmp[16:1]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpmulhrsw
__m512i _mm512_mulhrs_epi16 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_mulhrs_epi16 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmulhrsw
CPUID Flags: AVX512BW
Description
Multiply packed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to dst.
Operation
FOR j := 0 to 31
i := j*16
tmp[31:0] := ((a[i+15:i] * b[i+15:i]) >> 14) + 1
dst[i+15:i] := tmp[16:1]
ENDFOR
dst[MAX:512] := 0
pmulhrsw
__m64 _mm_mulhrs_pi16 (__m64 a, __m64 b)
Synopsis
__m64 _mm_mulhrs_pi16 (__m64 a, __m64 b)
#include "tmmintrin.h"
Instruction: pmulhrsw mm, mm
CPUID Flags: SSSE3
Description
Multiply packed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to dst.
Operation
FOR j := 0 to 3
i := j*16
tmp[31:0] := ((a[i+15:i] * b[i+15:i]) >> 14) + 1
dst[i+15:i] := tmp[16:1]
ENDFOR
vpmullw
__m128i _mm_mask_mullo_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_mullo_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmullw
CPUID Flags: AVX512VL + AVX512BW
Description
Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
tmp[31:0] := a[i+15:i] * b[i+15:i]
dst[i+15:i] := tmp[15:0]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:128] := 0
vpmullw
__m128i _mm_maskz_mullo_epi16 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_mullo_epi16 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmullw
CPUID Flags: AVX512VL + AVX512BW
Description
Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
tmp[31:0] := a[i+15:i] * b[i+15:i]
dst[i+15:i] := tmp[15:0]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
pmullw
__m128i _mm_mullo_epi16 (__m128i a, __m128i b)
Synopsis
__m128i _mm_mullo_epi16 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: pmullw xmm, xmm
CPUID Flags: SSE2
Description
Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst.
Operation
FOR j := 0 to 7
i := j*16
tmp[31:0] := a[i+15:i] * b[i+15:i]
dst[i+15:i] := tmp[15:0]
ENDFOR
Performance
vpmullw
__m256i _mm256_mask_mullo_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_mullo_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmullw
CPUID Flags: AVX512VL + AVX512BW
Description
Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
tmp[31:0] := a[i+15:i] * b[i+15:i]
dst[i+15:i] := tmp[15:0]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
vpmullw
__m256i _mm256_maskz_mullo_epi16 (__mmask16 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_mullo_epi16 (__mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmullw
CPUID Flags: AVX512VL + AVX512BW
Description
Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
tmp[31:0] := a[i+15:i] * b[i+15:i]
dst[i+15:i] := tmp[15:0]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpmullw
__m256i _mm256_mullo_epi16 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_mullo_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmullw ymm, ymm, ymm
CPUID Flags: AVX2
Description
Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst.
Operation
FOR j := 0 to 15
i := j*16
tmp[31:0] := a[i+15:i] * b[i+15:i]
dst[i+15:i] := tmp[15:0]
ENDFOR
dst[MAX:256] := 0
Performance
vpmullw
__m512i _mm512_mask_mullo_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_mullo_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmullw
CPUID Flags: AVX512BW
Description
Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
tmp[31:0] := a[i+15:i] * b[i+15:i]
dst[i+15:i] := tmp[15:0]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:512] := 0
vpmullw
__m512i _mm512_maskz_mullo_epi16 (__mmask32 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_mullo_epi16 (__mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmullw
CPUID Flags: AVX512BW
Description
Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
tmp[31:0] := a[i+15:i] * b[i+15:i]
dst[i+15:i] := tmp[15:0]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpmullw
__m512i _mm512_mullo_epi16 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_mullo_epi16 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmullw
CPUID Flags: AVX512BW
Description
Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in dst.
Operation
FOR j := 0 to 31
i := j*16
tmp[31:0] := a[i+15:i] * b[i+15:i]
dst[i+15:i] := tmp[15:0]
ENDFOR
dst[MAX:512] := 0
vpmulld
__m128i _mm_mask_mullo_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_mullo_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmulld
CPUID Flags: AVX512VL + AVX512F
Description
Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
tmp[63:0] := a[i+31:i] * b[i+31:i]
dst[i+31:i] := tmp[31:0]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vpmulld
__m128i _mm_maskz_mullo_epi32 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_mullo_epi32 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmulld
CPUID Flags: AVX512VL + AVX512F
Description
Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
tmp[63:0] := a[i+31:i] * b[i+31:i]
dst[i+31:i] := tmp[31:0]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
pmulld
__m128i _mm_mullo_epi32 (__m128i a, __m128i b)
Synopsis
__m128i _mm_mullo_epi32 (__m128i a, __m128i b)
#include "smmintrin.h"
Instruction: pmulld xmm, xmm
CPUID Flags: SSE4.1
Description
Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst.
Operation
FOR j := 0 to 3
i := j*32
tmp[63:0] := a[i+31:i] * b[i+31:i]
dst[i+31:i] := tmp[31:0]
ENDFOR
Performance
vpmulld
__m256i _mm256_mask_mullo_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_mullo_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmulld
CPUID Flags: AVX512VL + AVX512F
Description
Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
tmp[63:0] := a[i+31:i] * b[i+31:i]
dst[i+31:i] := tmp[31:0]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vpmulld
__m256i _mm256_maskz_mullo_epi32 (__mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_mullo_epi32 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmulld
CPUID Flags: AVX512VL + AVX512F
Description
Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
tmp[63:0] := a[i+31:i] * b[i+31:i]
dst[i+31:i] := tmp[31:0]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpmulld
__m256i _mm256_mullo_epi32 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_mullo_epi32 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmulld ymm, ymm, ymm
CPUID Flags: AVX2
Description
Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst.
Operation
FOR j := 0 to 7
i := j*32
tmp[63:0] := a[i+31:i] * b[i+31:i]
dst[i+31:i] := tmp[31:0]
ENDFOR
dst[MAX:256] := 0
Performance
vpmulld
__m512i _mm512_mask_mullo_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_mullo_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmulld zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
tmp[63:0] := a[i+31:i] * b[i+31:i]
dst[i+31:i] := tmp[31:0]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpmulld
__m512i _mm512_maskz_mullo_epi32 (__mmask16 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_mullo_epi32 (__mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmulld
CPUID Flags: AVX512F
Description
Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
tmp[63:0] := a[i+31:i] * b[i+31:i]
dst[i+31:i] := tmp[31:0]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpmulld
__m512i _mm512_mullo_epi32 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_mullo_epi32 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmulld zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in dst.
Operation
FOR j := 0 to 15
i := j*32
tmp[63:0] := a[i+31:i] * b[i+31:i]
dst[i+31:i] := tmp[31:0]
ENDFOR
dst[MAX:512] := 0
vpmullq
__m128i _mm_mask_mullo_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_mullo_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmullq
CPUID Flags: AVX512VL + AVX512DQ
Description
Multiply the packed 64-bit integers in a and b, producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
tmp[127:0] := a[i+63:i] * b[i+63:i]
dst[i+63:i] := tmp[63:0]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpmullq
__m128i _mm_maskz_mullo_epi64 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_mullo_epi64 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmullq
CPUID Flags: AVX512VL + AVX512DQ
Description
Multiply the packed 64-bit integers in a and b, producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
tmp[127:0] := a[i+63:i] * b[i+63:i]
dst[i+63:i] := tmp[63:0]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpmullq
__m128i _mm_mullo_epi64 (__m128i a, __m128i b)
Synopsis
__m128i _mm_mullo_epi64 (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmullq
CPUID Flags: AVX512VL + AVX512DQ
Description
Multiply the packed 64-bit integers in a and b, producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in dst.
Operation
FOR j := 0 to 1
i := j*64
tmp[127:0] := a[i+63:i] * b[i+63:i]
dst[i+63:i] := tmp[63:0]
ENDFOR
dst[MAX:128] := 0
vpmullq
__m256i _mm256_mask_mullo_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_mullo_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmullq
CPUID Flags: AVX512VL + AVX512DQ
Description
Multiply the packed 64-bit integers in a and b, producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
tmp[127:0] := a[i+63:i] * b[i+63:i]
dst[i+63:i] := tmp[63:0]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpmullq
__m256i _mm256_maskz_mullo_epi64 (__mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_mullo_epi64 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmullq
CPUID Flags: AVX512VL + AVX512DQ
Description
Multiply the packed 64-bit integers in a and b, producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
tmp[127:0] := a[i+63:i] * b[i+63:i]
dst[i+63:i] := tmp[63:0]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpmullq
__m256i _mm256_mullo_epi64 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_mullo_epi64 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmullq
CPUID Flags: AVX512VL + AVX512DQ
Description
Multiply the packed 64-bit integers in a and b, producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in dst.
Operation
FOR j := 0 to 3
i := j*64
tmp[127:0] := a[i+63:i] * b[i+63:i]
dst[i+63:i] := tmp[63:0]
ENDFOR
dst[MAX:256] := 0
vpmullq
__m512i _mm512_mask_mullo_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_mullo_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmullq
CPUID Flags: AVX512DQ
Description
Multiply the packed 64-bit integers in a and b, producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
tmp[127:0] := a[i+63:i] * b[i+63:i]
dst[i+63:i] := tmp[63:0]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpmullq
__m512i _mm512_maskz_mullo_epi64 (__mmask8 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_mullo_epi64 (__mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmullq
CPUID Flags: AVX512DQ
Description
Multiply the packed 64-bit integers in a and b, producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
tmp[127:0] := a[i+63:i] * b[i+63:i]
dst[i+63:i] := tmp[63:0]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpmullq
__m512i _mm512_mullo_epi64 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_mullo_epi64 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmullq
CPUID Flags: AVX512DQ
Description
Multiply the packed 64-bit integers in a and b, producing intermediate 128-bit integers, and store the low 64 bits of the intermediate integers in dst.
Operation
FOR j := 0 to 7
i := j*64
tmp[127:0] := a[i+63:i] * b[i+63:i]
dst[i+63:i] := tmp[63:0]
ENDFOR
dst[MAX:512] := 0
...
__m512i _mm512_mask_mullox_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_mullox_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Multiplies elements in packed 64-bit integer vectors a and b together, storing the lower 64 bits of the result in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] * b[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m512i _mm512_mullox_epi64 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_mullox_epi64 (__m512i a, __m512i b)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Multiplies elements in packed 64-bit integer vectors a and b together, storing the lower 64 bits of the result in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := a[i+63:i] * b[i+63:i]
ENDFOR
dst[MAX:512] := 0
vpmultishiftqb
__m128i _mm_mask_multishift_epi64_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_multishift_epi64_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmultishiftqb
CPUID Flags: AVX512VBMI + AVX512VL
Description
For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR i := 0 to 1
q := i * 64
FOR j := 0 to 7
tmp8 := 0
ctrl := a[q+j*8+7:q+j*8] & 63
FOR l := 0 to 7
tmp8[l] := b[q+((ctrl+l) & 63)]
ENDFOR
IF k[i*8+j]
dst[q+j*8+7:q+j*8] := tmp8[7:0]
ELSE
dst[q+j*8+7:q+j*8] := src[q+j*8+7:q+j*8]
FI
ENDFOR
ENDFOR
dst[MAX:128] := 0
vpmultishiftqb
__m128i _mm_maskz_multishift_epi64_epi8 (__mmask16 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_multishift_epi64_epi8 (__mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmultishiftqb
CPUID Flags: AVX512VBMI + AVX512VL
Description
For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR i := 0 to 1
q := i * 64
FOR j := 0 to 7
tmp8 := 0
ctrl := a[q+j*8+7:q+j*8] & 63
FOR l := 0 to 7
tmp8[l] := b[q+((ctrl+l) & 63)]
ENDFOR
IF k[i*8+j]
dst[q+j*8+7:q+j*8] := tmp8[7:0]
ELSE
dst[q+j*8+7:q+j*8] := 0
FI
ENDFOR
ENDFOR
dst[MAX:128] := 0
vpmultishiftqb
__m128i _mm_multishift_epi64_epi8 (__m128i a, __m128i b)
Synopsis
__m128i _mm_multishift_epi64_epi8 (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpmultishiftqb
CPUID Flags: AVX512VBMI + AVX512VL
Description
For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst.
Operation
FOR i := 0 to 1
q := i * 64
FOR j := 0 to 7
tmp8 := 0
ctrl := a[q+j*8+7:q+j*8] & 63
FOR l := 0 to 7
tmp8[l] := b[q+((ctrl+l) & 63)]
ENDFOR
dst[q+j*8+7:q+j*8] := tmp8[7:0]
ENDFOR
ENDFOR
dst[MAX:128] := 0
vpmultishiftqb
__m256i _mm256_mask_multishift_epi64_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_multishift_epi64_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmultishiftqb
CPUID Flags: AVX512VBMI + AVX512VL
Description
For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR i := 0 to 3
q := i * 64
FOR j := 0 to 7
tmp8 := 0
ctrl := a[q+j*8+7:q+j*8] & 63
FOR l := 0 to 7
tmp8[l] := b[q+((ctrl+l) & 63)]
ENDFOR
IF k[i*8+j]
dst[q+j*8+7:q+j*8] := tmp8[7:0]
ELSE
dst[q+j*8+7:q+j*8] := src[q+j*8+7:q+j*8]
FI
ENDFOR
ENDFOR
dst[MAX:256] := 0
vpmultishiftqb
__m256i _mm256_maskz_multishift_epi64_epi8 (__mmask32 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_multishift_epi64_epi8 (__mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmultishiftqb
CPUID Flags: AVX512VBMI + AVX512VL
Description
For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR i := 0 to 3
q := i * 64
FOR j := 0 to 7
tmp8 := 0
ctrl := a[q+j*8+7:q+j*8] & 63
FOR l := 0 to 7
tmp8[l] := b[q+((ctrl+l) & 63)]
ENDFOR
IF k[i*8+j]
dst[q+j*8+7:q+j*8] := tmp8[7:0]
ELSE
dst[q+j*8+7:q+j*8] := 0
FI
ENDFOR
ENDFOR
dst[MAX:256] := 0
vpmultishiftqb
__m256i _mm256_multishift_epi64_epi8 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_multishift_epi64_epi8 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpmultishiftqb
CPUID Flags: AVX512VBMI + AVX512VL
Description
For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst.
Operation
FOR i := 0 to 3
q := i * 64
FOR j := 0 to 7
tmp8 := 0
ctrl := a[q+j*8+7:q+j*8] & 63
FOR l := 0 to 7
tmp8[l] := b[q+((ctrl+l) & 63)]
ENDFOR
dst[q+j*8+7:q+j*8] := tmp8[7:0]
ENDFOR
ENDFOR
dst[MAX:256] := 0
vpmultishiftqb
__m512i _mm512_mask_multishift_epi64_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_multishift_epi64_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmultishiftqb
CPUID Flags: AVX512VBMI
Description
For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR i := 0 to 7
q := i * 64
FOR j := 0 to 7
tmp8 := 0
ctrl := a[q+j*8+7:q+j*8] & 63
FOR l := 0 to 7
tmp8[l] := b[q+((ctrl+l) & 63)]
ENDFOR
IF k[i*8+j]
dst[q+j*8+7:q+j*8] := tmp8[7:0]
ELSE
dst[q+j*8+7:q+j*8] := src[q+j*8+7:q+j*8]
FI
ENDFOR
ENDFOR
dst[MAX:512] := 0
vpmultishiftqb
__m512i _mm512_maskz_multishift_epi64_epi8 (__mmask64 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_multishift_epi64_epi8 (__mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmultishiftqb
CPUID Flags: AVX512VBMI
Description
For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR i := 0 to 7
q := i * 64
FOR j := 0 to 7
tmp8 := 0
ctrl := a[q+j*8+7:q+j*8] & 63
FOR l := 0 to 7
tmp8[l] := b[q+((ctrl+l) & 63)]
ENDFOR
IF k[i*8+j]
dst[q+j*8+7:q+j*8] := tmp8[7:0]
ELSE
dst[q+j*8+7:q+j*8] := 0
FI
ENDFOR
ENDFOR
dst[MAX:512] := 0
vpmultishiftqb
__m512i _mm512_multishift_epi64_epi8 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_multishift_epi64_epi8 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpmultishiftqb
CPUID Flags: AVX512VBMI
Description
For each 64-bit element in b, select 8 unaligned bytes using a byte-granular shift control within the corresponding 64-bit element of a, and store the 8 assembled bytes to the corresponding 64-bit element of dst.
Operation
FOR i := 0 to 7
q := i * 64
FOR j := 0 to 7
tmp8 := 0
ctrl := a[q+j*8+7:q+j*8] & 63
FOR l := 0 to 7
tmp8[l] := b[q+((ctrl+l) & 63)]
ENDFOR
dst[q+j*8+7:q+j*8] := tmp8[7:0]
ENDFOR
ENDFOR
dst[MAX:512] := 0
mwait
void _mm_mwait (unsigned extensions, unsigned hints)
Synopsis
void _mm_mwait (unsigned extensions, unsigned hints)
#include "pmmintrin.h"
Instruction: mwait
CPUID Flags: MONITOR
Description
Hint to the processor that it can enter an implementation-dependent-optimized state while waiting for an event or store operation to the address range specified by MONITOR.
...
__m512d _mm512_mask_nearbyint_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_nearbyint_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Rounds each packed double-precision (64-bit) floating-point element in a to the nearest integer value and stores the results as packed double-precision floating-point elements in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := NearbyInt(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m512d _mm512_nearbyint_pd (__m512d a)
Synopsis
__m512d _mm512_nearbyint_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Rounds each packed double-precision (64-bit) floating-point element in a to the nearest integer value and stores the results as packed double-precision floating-point elements in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := NearbyInt(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
...
__m512 _mm512_mask_nearbyint_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_nearbyint_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Rounds each packed single-precision (32-bit) floating-point element in a to the nearest integer value and stores the results as packed single-precision floating-point elements in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := NearbyInt(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m512 _mm512_nearbyint_ps (__m512 a)
Synopsis
__m512 _mm512_nearbyint_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Rounds each packed single-precision (32-bit) floating-point element in a to the nearest integer value and stores the results as packed single-precision floating-point elements in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := NearbyInt(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
vpord
__m128i _mm_mask_or_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_or_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpord
CPUID Flags: AVX512VL + AVX512F
Description
Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] OR b[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vpord
__m128i _mm_maskz_or_epi32 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_or_epi32 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpord
CPUID Flags: AVX512VL + AVX512F
Description
Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] OR b[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpord
__m256i _mm256_mask_or_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_or_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpord
CPUID Flags: AVX512VL + AVX512F
Description
Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] OR b[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vpord
__m256i _mm256_maskz_or_epi32 (__mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_or_epi32 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpord
CPUID Flags: AVX512VL + AVX512F
Description
Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] OR b[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpord
__m512i _mm512_mask_or_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_or_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpord zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] OR b[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpord
__m512i _mm512_maskz_or_epi32 (__mmask16 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_or_epi32 (__mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpord zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] OR b[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpord
__m512i _mm512_or_epi32 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_or_epi32 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpord zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compute the bitwise OR of packed 32-bit integers in a and b, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := a[i+31:i] OR b[i+31:i]
ENDFOR
dst[MAX:512] := 0
vporq
__m128i _mm_mask_or_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_or_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vporq
CPUID Flags: AVX512VL + AVX512F
Description
Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] OR b[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vporq
__m128i _mm_maskz_or_epi64 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_or_epi64 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vporq
CPUID Flags: AVX512VL + AVX512F
Description
Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] OR b[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vporq
__m256i _mm256_mask_or_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_or_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vporq
CPUID Flags: AVX512VL + AVX512F
Description
Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] OR b[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vporq
__m256i _mm256_maskz_or_epi64 (__mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_or_epi64 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vporq
CPUID Flags: AVX512VL + AVX512F
Description
Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] OR b[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vporq
__m512i _mm512_mask_or_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_or_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vporq zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] OR b[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vporq
__m512i _mm512_maskz_or_epi64 (__mmask8 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_or_epi64 (__mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vporq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] OR b[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vporq
__m512i _mm512_or_epi64 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_or_epi64 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vporq zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compute the bitwise OR of packed 64-bit integers in a and b, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := a[i+63:i] OR b[i+63:i]
ENDFOR
dst[MAX:512] := 0
vorpd
__m128d _mm_mask_or_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_mask_or_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vorpd
CPUID Flags: AVX512VL + AVX512DQ
Description
Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] BITWISE OR b[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vorpd
__m128d _mm_maskz_or_pd (__mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_maskz_or_pd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vorpd
CPUID Flags: AVX512VL + AVX512DQ
Description
Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] BITWISE OR b[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
orpd
__m128d _mm_or_pd (__m128d a, __m128d b)
Synopsis
__m128d _mm_or_pd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: orpd xmm, xmm
CPUID Flags: SSE2
Description
Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := a[i+63:i] BITWISE OR b[i+63:i]
ENDFOR
Performance
vorpd
__m256d _mm256_mask_or_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)
Synopsis
__m256d _mm256_mask_or_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vorpd
CPUID Flags: AVX512VL + AVX512DQ
Description
Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] BITWISE OR b[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vorpd
__m256d _mm256_maskz_or_pd (__mmask8 k, __m256d a, __m256d b)
Synopsis
__m256d _mm256_maskz_or_pd (__mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vorpd
CPUID Flags: AVX512VL + AVX512DQ
Description
Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] BITWISE OR b[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vorpd
__m256d _mm256_or_pd (__m256d a, __m256d b)
Synopsis
__m256d _mm256_or_pd (__m256d a, __m256d b)
#include "immintrin.h"
Instruction: vorpd ymm, ymm, ymm
CPUID Flags: AVX
Description
Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := a[i+63:i] BITWISE OR b[i+63:i]
ENDFOR
dst[MAX:256] := 0
Performance
vorpd
__m512d _mm512_mask_or_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
Synopsis
__m512d _mm512_mask_or_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vorpd
CPUID Flags: AVX512DQ
Description
Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] BITWISE OR b[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vorpd
__m512d _mm512_maskz_or_pd (__mmask8 k, __m512d a, __m512d b)
Synopsis
__m512d _mm512_maskz_or_pd (__mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vorpd
CPUID Flags: AVX512DQ
Description
Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] BITWISE OR b[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vorpd
__m512d _mm512_or_pd (__m512d a, __m512d b)
Synopsis
__m512d _mm512_or_pd (__m512d a, __m512d b)
#include "immintrin.h"
Instruction: vorpd
CPUID Flags: AVX512DQ
Description
Compute the bitwise OR of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := a[i+63:i] BITWISE OR b[i+63:i]
ENDFOR
dst[MAX:512] := 0
vorps
__m128 _mm_mask_or_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_mask_or_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vorps
CPUID Flags: AVX512VL + AVX512DQ
Description
Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] BITWISE OR b[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vorps
__m128 _mm_maskz_or_ps (__mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_maskz_or_ps (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vorps
CPUID Flags: AVX512VL + AVX512DQ
Description
Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] BITWISE OR b[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
orps
__m128 _mm_or_ps (__m128 a, __m128 b)
Synopsis
__m128 _mm_or_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: orps xmm, xmm
CPUID Flags: SSE
Description
Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := a[i+31:i] BITWISE OR b[i+31:i]
ENDFOR
Performance
vorps
__m256 _mm256_mask_or_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)
Synopsis
__m256 _mm256_mask_or_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vorps
CPUID Flags: AVX512VL + AVX512DQ
Description
Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] BITWISE OR b[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vorps
__m256 _mm256_maskz_or_ps (__mmask8 k, __m256 a, __m256 b)
Synopsis
__m256 _mm256_maskz_or_ps (__mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vorps
CPUID Flags: AVX512VL + AVX512DQ
Description
Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] BITWISE OR b[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vorps
__m256 _mm256_or_ps (__m256 a, __m256 b)
Synopsis
__m256 _mm256_or_ps (__m256 a, __m256 b)
#include "immintrin.h"
Instruction: vorps ymm, ymm, ymm
CPUID Flags: AVX
Description
Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := a[i+31:i] BITWISE OR b[i+31:i]
ENDFOR
dst[MAX:256] := 0
Performance
vorps
__m512 _mm512_mask_or_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
Synopsis
__m512 _mm512_mask_or_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vorps
CPUID Flags: AVX512DQ
Description
Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] BITWISE OR b[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vorps
__m512 _mm512_maskz_or_ps (__mmask16 k, __m512 a, __m512 b)
Synopsis
__m512 _mm512_maskz_or_ps (__mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vorps
CPUID Flags: AVX512DQ
Description
Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] BITWISE OR b[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vorps
__m512 _mm512_or_ps (__m512 a, __m512 b)
Synopsis
__m512 _mm512_or_ps (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vorps
CPUID Flags: AVX512DQ
Description
Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := a[i+31:i] BITWISE OR b[i+31:i]
ENDFOR
dst[MAX:512] := 0
por
__m128i _mm_or_si128 (__m128i a, __m128i b)
Synopsis
__m128i _mm_or_si128 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: por xmm, xmm
CPUID Flags: SSE2
Description
Compute the bitwise OR of 128 bits (representing integer data) in a and b, and store the result in dst.
Operation
dst[127:0] := (a[127:0] OR b[127:0])
Performance
vpor
__m256i _mm256_or_si256 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_or_si256 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpor ymm, ymm, ymm
CPUID Flags: AVX2
Description
Compute the bitwise OR of 256 bits (representing integer data) in a and b, and store the result in dst.
Operation
dst[255:0] := (a[255:0] OR b[255:0])
dst[MAX:256] := 0
Performance
vpord
__m512i _mm512_or_si512 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_or_si512 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpord zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compute the bitwise OR of 512 bits (representing integer data) in a and b, and store the result in dst.
Operation
dst[511:0] := (a[511:0] OR b[511:0])
dst[MAX:512] := 0
vpacksswb
__m128i _mm_mask_packs_epi16 (__m128i src, __mmask16 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_packs_epi16 (__m128i src, __mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpacksswb
CPUID Flags: AVX512VL + AVX512BW
Description
Convert packed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
tmp_dst[7:0] := Saturate_Int16_To_Int8 (a[15:0])
tmp_dst[15:8] := Saturate_Int16_To_Int8 (a[31:16])
tmp_dst[23:16] := Saturate_Int16_To_Int8 (a[47:32])
tmp_dst[31:24] := Saturate_Int16_To_Int8 (a[63:48])
tmp_dst[39:32] := Saturate_Int16_To_Int8 (a[79:64])
tmp_dst[47:40] := Saturate_Int16_To_Int8 (a[95:80])
tmp_dst[55:48] := Saturate_Int16_To_Int8 (a[111:96])
tmp_dst[63:56] := Saturate_Int16_To_Int8 (a[127:112])
tmp_dst[71:64] := Saturate_Int16_To_Int8 (b[15:0])
tmp_dst[79:72] := Saturate_Int16_To_Int8 (b[31:16])
tmp_dst[87:80] := Saturate_Int16_To_Int8 (b[47:32])
tmp_dst[95:88] := Saturate_Int16_To_Int8 (b[63:48])
tmp_dst[103:96] := Saturate_Int16_To_Int8 (b[79:64])
tmp_dst[111:104] := Saturate_Int16_To_Int8 (b[95:80])
tmp_dst[119:112] := Saturate_Int16_To_Int8 (b[111:96])
tmp_dst[127:120] := Saturate_Int16_To_Int8 (b[127:112])
FOR j := 0 to 15
i := j*8
IF k[j]
dst[i+7:i] := tmp_dst[i+7:i]
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:128] := 0
vpacksswb
__m128i _mm_maskz_packs_epi16 (__mmask16 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_packs_epi16 (__mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpacksswb
CPUID Flags: AVX512VL + AVX512BW
Description
Convert packed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
tmp_dst[7:0] := Saturate_Int16_To_Int8 (a[15:0])
tmp_dst[15:8] := Saturate_Int16_To_Int8 (a[31:16])
tmp_dst[23:16] := Saturate_Int16_To_Int8 (a[47:32])
tmp_dst[31:24] := Saturate_Int16_To_Int8 (a[63:48])
tmp_dst[39:32] := Saturate_Int16_To_Int8 (a[79:64])
tmp_dst[47:40] := Saturate_Int16_To_Int8 (a[95:80])
tmp_dst[55:48] := Saturate_Int16_To_Int8 (a[111:96])
tmp_dst[63:56] := Saturate_Int16_To_Int8 (a[127:112])
tmp_dst[71:64] := Saturate_Int16_To_Int8 (b[15:0])
tmp_dst[79:72] := Saturate_Int16_To_Int8 (b[31:16])
tmp_dst[87:80] := Saturate_Int16_To_Int8 (b[47:32])
tmp_dst[95:88] := Saturate_Int16_To_Int8 (b[63:48])
tmp_dst[103:96] := Saturate_Int16_To_Int8 (b[79:64])
tmp_dst[111:104] := Saturate_Int16_To_Int8 (b[95:80])
tmp_dst[119:112] := Saturate_Int16_To_Int8 (b[111:96])
tmp_dst[127:120] := Saturate_Int16_To_Int8 (b[127:112])
FOR j := 0 to 15
i := j*8
IF k[j]
dst[i+7:i] := tmp_dst[i+7:i]
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
packsswb
__m128i _mm_packs_epi16 (__m128i a, __m128i b)
Synopsis
__m128i _mm_packs_epi16 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: packsswb xmm, xmm
CPUID Flags: SSE2
Description
Convert packed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst.
Operation
dst[7:0] := Saturate_Int16_To_Int8 (a[15:0])
dst[15:8] := Saturate_Int16_To_Int8 (a[31:16])
dst[23:16] := Saturate_Int16_To_Int8 (a[47:32])
dst[31:24] := Saturate_Int16_To_Int8 (a[63:48])
dst[39:32] := Saturate_Int16_To_Int8 (a[79:64])
dst[47:40] := Saturate_Int16_To_Int8 (a[95:80])
dst[55:48] := Saturate_Int16_To_Int8 (a[111:96])
dst[63:56] := Saturate_Int16_To_Int8 (a[127:112])
dst[71:64] := Saturate_Int16_To_Int8 (b[15:0])
dst[79:72] := Saturate_Int16_To_Int8 (b[31:16])
dst[87:80] := Saturate_Int16_To_Int8 (b[47:32])
dst[95:88] := Saturate_Int16_To_Int8 (b[63:48])
dst[103:96] := Saturate_Int16_To_Int8 (b[79:64])
dst[111:104] := Saturate_Int16_To_Int8 (b[95:80])
dst[119:112] := Saturate_Int16_To_Int8 (b[111:96])
dst[127:120] := Saturate_Int16_To_Int8 (b[127:112])
Performance
vpacksswb
__m256i _mm256_mask_packs_epi16 (__m256i src, __mmask32 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_packs_epi16 (__m256i src, __mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpacksswb
CPUID Flags: AVX512VL + AVX512BW
Description
Convert packed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
tmp_dst[7:0] := Saturate_Int16_To_Int8 (a[15:0])
tmp_dst[15:8] := Saturate_Int16_To_Int8 (a[31:16])
tmp_dst[23:16] := Saturate_Int16_To_Int8 (a[47:32])
tmp_dst[31:24] := Saturate_Int16_To_Int8 (a[63:48])
tmp_dst[39:32] := Saturate_Int16_To_Int8 (a[79:64])
tmp_dst[47:40] := Saturate_Int16_To_Int8 (a[95:80])
tmp_dst[55:48] := Saturate_Int16_To_Int8 (a[111:96])
tmp_dst[63:56] := Saturate_Int16_To_Int8 (a[127:112])
tmp_dst[71:64] := Saturate_Int16_To_Int8 (b[15:0])
tmp_dst[79:72] := Saturate_Int16_To_Int8 (b[31:16])
tmp_dst[87:80] := Saturate_Int16_To_Int8 (b[47:32])
tmp_dst[95:88] := Saturate_Int16_To_Int8 (b[63:48])
tmp_dst[103:96] := Saturate_Int16_To_Int8 (b[79:64])
tmp_dst[111:104] := Saturate_Int16_To_Int8 (b[95:80])
tmp_dst[119:112] := Saturate_Int16_To_Int8 (b[111:96])
tmp_dst[127:120] := Saturate_Int16_To_Int8 (b[127:112])
tmp_dst[135:128] := Saturate_Int16_To_Int8 (a[143:128])
tmp_dst[143:136] := Saturate_Int16_To_Int8 (a[159:144])
tmp_dst[151:144] := Saturate_Int16_To_Int8 (a[175:160])
tmp_dst[159:152] := Saturate_Int16_To_Int8 (a[191:176])
tmp_dst[167:160] := Saturate_Int16_To_Int8 (a[207:192])
tmp_dst[175:168] := Saturate_Int16_To_Int8 (a[223:208])
tmp_dst[183:176] := Saturate_Int16_To_Int8 (a[239:224])
tmp_dst[191:184] := Saturate_Int16_To_Int8 (a[255:240])
tmp_dst[199:192] := Saturate_Int16_To_Int8 (b[143:128])
tmp_dst[207:200] := Saturate_Int16_To_Int8 (b[159:144])
tmp_dst[215:208] := Saturate_Int16_To_Int8 (b[175:160])
tmp_dst[223:216] := Saturate_Int16_To_Int8 (b[191:176])
tmp_dst[231:224] := Saturate_Int16_To_Int8 (b[207:192])
tmp_dst[239:232] := Saturate_Int16_To_Int8 (b[223:208])
tmp_dst[247:240] := Saturate_Int16_To_Int8 (b[239:224])
tmp_dst[255:248] := Saturate_Int16_To_Int8 (b[255:240])
FOR j := 0 to 31
i := j*8
IF k[j]
dst[i+7:i] := tmp_dst[i+7:i]
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:256] := 0
vpacksswb
__m256i _mm256_maskz_packs_epi16 (__mmask32 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_packs_epi16 (__mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpacksswb
CPUID Flags: AVX512VL + AVX512BW
Description
Convert packed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
tmp_dst[7:0] := Saturate_Int16_To_Int8 (a[15:0])
tmp_dst[15:8] := Saturate_Int16_To_Int8 (a[31:16])
tmp_dst[23:16] := Saturate_Int16_To_Int8 (a[47:32])
tmp_dst[31:24] := Saturate_Int16_To_Int8 (a[63:48])
tmp_dst[39:32] := Saturate_Int16_To_Int8 (a[79:64])
tmp_dst[47:40] := Saturate_Int16_To_Int8 (a[95:80])
tmp_dst[55:48] := Saturate_Int16_To_Int8 (a[111:96])
tmp_dst[63:56] := Saturate_Int16_To_Int8 (a[127:112])
tmp_dst[71:64] := Saturate_Int16_To_Int8 (b[15:0])
tmp_dst[79:72] := Saturate_Int16_To_Int8 (b[31:16])
tmp_dst[87:80] := Saturate_Int16_To_Int8 (b[47:32])
tmp_dst[95:88] := Saturate_Int16_To_Int8 (b[63:48])
tmp_dst[103:96] := Saturate_Int16_To_Int8 (b[79:64])
tmp_dst[111:104] := Saturate_Int16_To_Int8 (b[95:80])
tmp_dst[119:112] := Saturate_Int16_To_Int8 (b[111:96])
tmp_dst[127:120] := Saturate_Int16_To_Int8 (b[127:112])
tmp_dst[135:128] := Saturate_Int16_To_Int8 (a[143:128])
tmp_dst[143:136] := Saturate_Int16_To_Int8 (a[159:144])
tmp_dst[151:144] := Saturate_Int16_To_Int8 (a[175:160])
tmp_dst[159:152] := Saturate_Int16_To_Int8 (a[191:176])
tmp_dst[167:160] := Saturate_Int16_To_Int8 (a[207:192])
tmp_dst[175:168] := Saturate_Int16_To_Int8 (a[223:208])
tmp_dst[183:176] := Saturate_Int16_To_Int8 (a[239:224])
tmp_dst[191:184] := Saturate_Int16_To_Int8 (a[255:240])
tmp_dst[199:192] := Saturate_Int16_To_Int8 (b[143:128])
tmp_dst[207:200] := Saturate_Int16_To_Int8 (b[159:144])
tmp_dst[215:208] := Saturate_Int16_To_Int8 (b[175:160])
tmp_dst[223:216] := Saturate_Int16_To_Int8 (b[191:176])
tmp_dst[231:224] := Saturate_Int16_To_Int8 (b[207:192])
tmp_dst[239:232] := Saturate_Int16_To_Int8 (b[223:208])
tmp_dst[247:240] := Saturate_Int16_To_Int8 (b[239:224])
tmp_dst[255:248] := Saturate_Int16_To_Int8 (b[255:240])
FOR j := 0 to 31
i := j*8
IF k[j]
dst[i+7:i] := tmp_dst[i+7:i]
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpacksswb
__m256i _mm256_packs_epi16 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_packs_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpacksswb ymm, ymm, ymm
CPUID Flags: AVX2
Description
Convert packed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst.
Operation
dst[7:0] := Saturate_Int16_To_Int8 (a[15:0])
dst[15:8] := Saturate_Int16_To_Int8 (a[31:16])
dst[23:16] := Saturate_Int16_To_Int8 (a[47:32])
dst[31:24] := Saturate_Int16_To_Int8 (a[63:48])
dst[39:32] := Saturate_Int16_To_Int8 (a[79:64])
dst[47:40] := Saturate_Int16_To_Int8 (a[95:80])
dst[55:48] := Saturate_Int16_To_Int8 (a[111:96])
dst[63:56] := Saturate_Int16_To_Int8 (a[127:112])
dst[71:64] := Saturate_Int16_To_Int8 (b[15:0])
dst[79:72] := Saturate_Int16_To_Int8 (b[31:16])
dst[87:80] := Saturate_Int16_To_Int8 (b[47:32])
dst[95:88] := Saturate_Int16_To_Int8 (b[63:48])
dst[103:96] := Saturate_Int16_To_Int8 (b[79:64])
dst[111:104] := Saturate_Int16_To_Int8 (b[95:80])
dst[119:112] := Saturate_Int16_To_Int8 (b[111:96])
dst[127:120] := Saturate_Int16_To_Int8 (b[127:112])
dst[135:128] := Saturate_Int16_To_Int8 (a[143:128])
dst[143:136] := Saturate_Int16_To_Int8 (a[159:144])
dst[151:144] := Saturate_Int16_To_Int8 (a[175:160])
dst[159:152] := Saturate_Int16_To_Int8 (a[191:176])
dst[167:160] := Saturate_Int16_To_Int8 (a[207:192])
dst[175:168] := Saturate_Int16_To_Int8 (a[223:208])
dst[183:176] := Saturate_Int16_To_Int8 (a[239:224])
dst[191:184] := Saturate_Int16_To_Int8 (a[255:240])
dst[199:192] := Saturate_Int16_To_Int8 (b[143:128])
dst[207:200] := Saturate_Int16_To_Int8 (b[159:144])
dst[215:208] := Saturate_Int16_To_Int8 (b[175:160])
dst[223:216] := Saturate_Int16_To_Int8 (b[191:176])
dst[231:224] := Saturate_Int16_To_Int8 (b[207:192])
dst[239:232] := Saturate_Int16_To_Int8 (b[223:208])
dst[247:240] := Saturate_Int16_To_Int8 (b[239:224])
dst[255:248] := Saturate_Int16_To_Int8 (b[255:240])
dst[MAX:256] := 0
Performance
vpacksswb
__m512i _mm512_mask_packs_epi16 (__m512i src, __mmask64 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_packs_epi16 (__m512i src, __mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpacksswb
CPUID Flags: AVX512BW
Description
Convert packed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
tmp_dst[7:0] := Saturate_Int16_To_Int8 (a[15:0])
tmp_dst[15:8] := Saturate_Int16_To_Int8 (a[31:16])
tmp_dst[23:16] := Saturate_Int16_To_Int8 (a[47:32])
tmp_dst[31:24] := Saturate_Int16_To_Int8 (a[63:48])
tmp_dst[39:32] := Saturate_Int16_To_Int8 (a[79:64])
tmp_dst[47:40] := Saturate_Int16_To_Int8 (a[95:80])
tmp_dst[55:48] := Saturate_Int16_To_Int8 (a[111:96])
tmp_dst[63:56] := Saturate_Int16_To_Int8 (a[127:112])
tmp_dst[71:64] := Saturate_Int16_To_Int8 (b[15:0])
tmp_dst[79:72] := Saturate_Int16_To_Int8 (b[31:16])
tmp_dst[87:80] := Saturate_Int16_To_Int8 (b[47:32])
tmp_dst[95:88] := Saturate_Int16_To_Int8 (b[63:48])
tmp_dst[103:96] := Saturate_Int16_To_Int8 (b[79:64])
tmp_dst[111:104] := Saturate_Int16_To_Int8 (b[95:80])
tmp_dst[119:112] := Saturate_Int16_To_Int8 (b[111:96])
tmp_dst[127:120] := Saturate_Int16_To_Int8 (b[127:112])
tmp_dst[135:128] := Saturate_Int16_To_Int8 (a[143:128])
tmp_dst[143:136] := Saturate_Int16_To_Int8 (a[159:144])
tmp_dst[151:144] := Saturate_Int16_To_Int8 (a[175:160])
tmp_dst[159:152] := Saturate_Int16_To_Int8 (a[191:176])
tmp_dst[167:160] := Saturate_Int16_To_Int8 (a[207:192])
tmp_dst[175:168] := Saturate_Int16_To_Int8 (a[223:208])
tmp_dst[183:176] := Saturate_Int16_To_Int8 (a[239:224])
tmp_dst[191:184] := Saturate_Int16_To_Int8 (a[255:240])
tmp_dst[199:192] := Saturate_Int16_To_Int8 (b[143:128])
tmp_dst[207:200] := Saturate_Int16_To_Int8 (b[159:144])
tmp_dst[215:208] := Saturate_Int16_To_Int8 (b[175:160])
tmp_dst[223:216] := Saturate_Int16_To_Int8 (b[191:176])
tmp_dst[231:224] := Saturate_Int16_To_Int8 (b[207:192])
tmp_dst[239:232] := Saturate_Int16_To_Int8 (b[223:208])
tmp_dst[247:240] := Saturate_Int16_To_Int8 (b[239:224])
tmp_dst[255:248] := Saturate_Int16_To_Int8 (b[255:240])
tmp_dst[263:256] := Saturate_Int16_To_Int8 (a[271:256])
tmp_dst[271:264] := Saturate_Int16_To_Int8 (a[287:272])
tmp_dst[279:272] := Saturate_Int16_To_Int8 (a[303:288])
tmp_dst[287:280] := Saturate_Int16_To_Int8 (a[319:304])
tmp_dst[295:288] := Saturate_Int16_To_Int8 (a[335:320])
tmp_dst[303:296] := Saturate_Int16_To_Int8 (a[351:336])
tmp_dst[311:304] := Saturate_Int16_To_Int8 (a[367:352])
tmp_dst[319:312] := Saturate_Int16_To_Int8 (a[383:368])
tmp_dst[327:320] := Saturate_Int16_To_Int8 (b[271:256])
tmp_dst[335:328] := Saturate_Int16_To_Int8 (b[287:272])
tmp_dst[343:336] := Saturate_Int16_To_Int8 (b[303:288])
tmp_dst[351:344] := Saturate_Int16_To_Int8 (b[319:304])
tmp_dst[359:352] := Saturate_Int16_To_Int8 (b[335:320])
tmp_dst[367:360] := Saturate_Int16_To_Int8 (b[351:336])
tmp_dst[375:368] := Saturate_Int16_To_Int8 (b[367:352])
tmp_dst[383:376] := Saturate_Int16_To_Int8 (b[383:368])
tmp_dst[391:384] := Saturate_Int16_To_Int8 (a[399:384])
tmp_dst[399:392] := Saturate_Int16_To_Int8 (a[415:400])
tmp_dst[407:400] := Saturate_Int16_To_Int8 (a[431:416])
tmp_dst[415:408] := Saturate_Int16_To_Int8 (a[447:432])
tmp_dst[423:416] := Saturate_Int16_To_Int8 (a[463:448])
tmp_dst[431:424] := Saturate_Int16_To_Int8 (a[479:464])
tmp_dst[439:432] := Saturate_Int16_To_Int8 (a[495:480])
tmp_dst[447:440] := Saturate_Int16_To_Int8 (a[511:496])
tmp_dst[455:448] := Saturate_Int16_To_Int8 (b[399:384])
tmp_dst[463:456] := Saturate_Int16_To_Int8 (b[415:400])
tmp_dst[471:464] := Saturate_Int16_To_Int8 (b[431:416])
tmp_dst[479:472] := Saturate_Int16_To_Int8 (b[447:432])
tmp_dst[487:480] := Saturate_Int16_To_Int8 (b[463:448])
tmp_dst[495:488] := Saturate_Int16_To_Int8 (b[479:464])
tmp_dst[503:496] := Saturate_Int16_To_Int8 (b[495:480])
tmp_dst[511:504] := Saturate_Int16_To_Int8 (b[511:496])
FOR j := 0 to 63
i := j*8
IF k[j]
dst[i+7:i] := tmp_dst[i+7:i]
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:512] := 0
vpacksswb
__m512i _mm512_maskz_packs_epi16 (__mmask64 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_packs_epi16 (__mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpacksswb
CPUID Flags: AVX512BW
Description
Convert packed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
tmp_dst[7:0] := Saturate_Int16_To_Int8 (a[15:0])
tmp_dst[15:8] := Saturate_Int16_To_Int8 (a[31:16])
tmp_dst[23:16] := Saturate_Int16_To_Int8 (a[47:32])
tmp_dst[31:24] := Saturate_Int16_To_Int8 (a[63:48])
tmp_dst[39:32] := Saturate_Int16_To_Int8 (a[79:64])
tmp_dst[47:40] := Saturate_Int16_To_Int8 (a[95:80])
tmp_dst[55:48] := Saturate_Int16_To_Int8 (a[111:96])
tmp_dst[63:56] := Saturate_Int16_To_Int8 (a[127:112])
tmp_dst[71:64] := Saturate_Int16_To_Int8 (b[15:0])
tmp_dst[79:72] := Saturate_Int16_To_Int8 (b[31:16])
tmp_dst[87:80] := Saturate_Int16_To_Int8 (b[47:32])
tmp_dst[95:88] := Saturate_Int16_To_Int8 (b[63:48])
tmp_dst[103:96] := Saturate_Int16_To_Int8 (b[79:64])
tmp_dst[111:104] := Saturate_Int16_To_Int8 (b[95:80])
tmp_dst[119:112] := Saturate_Int16_To_Int8 (b[111:96])
tmp_dst[127:120] := Saturate_Int16_To_Int8 (b[127:112])
tmp_dst[135:128] := Saturate_Int16_To_Int8 (a[143:128])
tmp_dst[143:136] := Saturate_Int16_To_Int8 (a[159:144])
tmp_dst[151:144] := Saturate_Int16_To_Int8 (a[175:160])
tmp_dst[159:152] := Saturate_Int16_To_Int8 (a[191:176])
tmp_dst[167:160] := Saturate_Int16_To_Int8 (a[207:192])
tmp_dst[175:168] := Saturate_Int16_To_Int8 (a[223:208])
tmp_dst[183:176] := Saturate_Int16_To_Int8 (a[239:224])
tmp_dst[191:184] := Saturate_Int16_To_Int8 (a[255:240])
tmp_dst[199:192] := Saturate_Int16_To_Int8 (b[143:128])
tmp_dst[207:200] := Saturate_Int16_To_Int8 (b[159:144])
tmp_dst[215:208] := Saturate_Int16_To_Int8 (b[175:160])
tmp_dst[223:216] := Saturate_Int16_To_Int8 (b[191:176])
tmp_dst[231:224] := Saturate_Int16_To_Int8 (b[207:192])
tmp_dst[239:232] := Saturate_Int16_To_Int8 (b[223:208])
tmp_dst[247:240] := Saturate_Int16_To_Int8 (b[239:224])
tmp_dst[255:248] := Saturate_Int16_To_Int8 (b[255:240])
tmp_dst[263:256] := Saturate_Int16_To_Int8 (a[271:256])
tmp_dst[271:264] := Saturate_Int16_To_Int8 (a[287:272])
tmp_dst[279:272] := Saturate_Int16_To_Int8 (a[303:288])
tmp_dst[287:280] := Saturate_Int16_To_Int8 (a[319:304])
tmp_dst[295:288] := Saturate_Int16_To_Int8 (a[335:320])
tmp_dst[303:296] := Saturate_Int16_To_Int8 (a[351:336])
tmp_dst[311:304] := Saturate_Int16_To_Int8 (a[367:352])
tmp_dst[319:312] := Saturate_Int16_To_Int8 (a[383:368])
tmp_dst[327:320] := Saturate_Int16_To_Int8 (b[271:256])
tmp_dst[335:328] := Saturate_Int16_To_Int8 (b[287:272])
tmp_dst[343:336] := Saturate_Int16_To_Int8 (b[303:288])
tmp_dst[351:344] := Saturate_Int16_To_Int8 (b[319:304])
tmp_dst[359:352] := Saturate_Int16_To_Int8 (b[335:320])
tmp_dst[367:360] := Saturate_Int16_To_Int8 (b[351:336])
tmp_dst[375:368] := Saturate_Int16_To_Int8 (b[367:352])
tmp_dst[383:376] := Saturate_Int16_To_Int8 (b[383:368])
tmp_dst[391:384] := Saturate_Int16_To_Int8 (a[399:384])
tmp_dst[399:392] := Saturate_Int16_To_Int8 (a[415:400])
tmp_dst[407:400] := Saturate_Int16_To_Int8 (a[431:416])
tmp_dst[415:408] := Saturate_Int16_To_Int8 (a[447:432])
tmp_dst[423:416] := Saturate_Int16_To_Int8 (a[463:448])
tmp_dst[431:424] := Saturate_Int16_To_Int8 (a[479:464])
tmp_dst[439:432] := Saturate_Int16_To_Int8 (a[495:480])
tmp_dst[447:440] := Saturate_Int16_To_Int8 (a[511:496])
tmp_dst[455:448] := Saturate_Int16_To_Int8 (b[399:384])
tmp_dst[463:456] := Saturate_Int16_To_Int8 (b[415:400])
tmp_dst[471:464] := Saturate_Int16_To_Int8 (b[431:416])
tmp_dst[479:472] := Saturate_Int16_To_Int8 (b[447:432])
tmp_dst[487:480] := Saturate_Int16_To_Int8 (b[463:448])
tmp_dst[495:488] := Saturate_Int16_To_Int8 (b[479:464])
tmp_dst[503:496] := Saturate_Int16_To_Int8 (b[495:480])
tmp_dst[511:504] := Saturate_Int16_To_Int8 (b[511:496])
FOR j := 0 to 63
i := j*8
IF k[j]
dst[i+7:i] := tmp_dst[i+7:i]
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpacksswb
__m512i _mm512_packs_epi16 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_packs_epi16 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpacksswb
CPUID Flags: AVX512BW
Description
Convert packed 16-bit integers from a and b to packed 8-bit integers using signed saturation, and store the results in dst.
Operation
dst[7:0] := Saturate_Int16_To_Int8 (a[15:0])
dst[15:8] := Saturate_Int16_To_Int8 (a[31:16])
dst[23:16] := Saturate_Int16_To_Int8 (a[47:32])
dst[31:24] := Saturate_Int16_To_Int8 (a[63:48])
dst[39:32] := Saturate_Int16_To_Int8 (a[79:64])
dst[47:40] := Saturate_Int16_To_Int8 (a[95:80])
dst[55:48] := Saturate_Int16_To_Int8 (a[111:96])
dst[63:56] := Saturate_Int16_To_Int8 (a[127:112])
dst[71:64] := Saturate_Int16_To_Int8 (b[15:0])
dst[79:72] := Saturate_Int16_To_Int8 (b[31:16])
dst[87:80] := Saturate_Int16_To_Int8 (b[47:32])
dst[95:88] := Saturate_Int16_To_Int8 (b[63:48])
dst[103:96] := Saturate_Int16_To_Int8 (b[79:64])
dst[111:104] := Saturate_Int16_To_Int8 (b[95:80])
dst[119:112] := Saturate_Int16_To_Int8 (b[111:96])
dst[127:120] := Saturate_Int16_To_Int8 (b[127:112])
dst[135:128] := Saturate_Int16_To_Int8 (a[143:128])
dst[143:136] := Saturate_Int16_To_Int8 (a[159:144])
dst[151:144] := Saturate_Int16_To_Int8 (a[175:160])
dst[159:152] := Saturate_Int16_To_Int8 (a[191:176])
dst[167:160] := Saturate_Int16_To_Int8 (a[207:192])
dst[175:168] := Saturate_Int16_To_Int8 (a[223:208])
dst[183:176] := Saturate_Int16_To_Int8 (a[239:224])
dst[191:184] := Saturate_Int16_To_Int8 (a[255:240])
dst[199:192] := Saturate_Int16_To_Int8 (b[143:128])
dst[207:200] := Saturate_Int16_To_Int8 (b[159:144])
dst[215:208] := Saturate_Int16_To_Int8 (b[175:160])
dst[223:216] := Saturate_Int16_To_Int8 (b[191:176])
dst[231:224] := Saturate_Int16_To_Int8 (b[207:192])
dst[239:232] := Saturate_Int16_To_Int8 (b[223:208])
dst[247:240] := Saturate_Int16_To_Int8 (b[239:224])
dst[255:248] := Saturate_Int16_To_Int8 (b[255:240])
dst[263:256] := Saturate_Int16_To_Int8 (a[271:256])
dst[271:264] := Saturate_Int16_To_Int8 (a[287:272])
dst[279:272] := Saturate_Int16_To_Int8 (a[303:288])
dst[287:280] := Saturate_Int16_To_Int8 (a[319:304])
dst[295:288] := Saturate_Int16_To_Int8 (a[335:320])
dst[303:296] := Saturate_Int16_To_Int8 (a[351:336])
dst[311:304] := Saturate_Int16_To_Int8 (a[367:352])
dst[319:312] := Saturate_Int16_To_Int8 (a[383:368])
dst[327:320] := Saturate_Int16_To_Int8 (b[271:256])
dst[335:328] := Saturate_Int16_To_Int8 (b[287:272])
dst[343:336] := Saturate_Int16_To_Int8 (b[303:288])
dst[351:344] := Saturate_Int16_To_Int8 (b[319:304])
dst[359:352] := Saturate_Int16_To_Int8 (b[335:320])
dst[367:360] := Saturate_Int16_To_Int8 (b[351:336])
dst[375:368] := Saturate_Int16_To_Int8 (b[367:352])
dst[383:376] := Saturate_Int16_To_Int8 (b[383:368])
dst[391:384] := Saturate_Int16_To_Int8 (a[399:384])
dst[399:392] := Saturate_Int16_To_Int8 (a[415:400])
dst[407:400] := Saturate_Int16_To_Int8 (a[431:416])
dst[415:408] := Saturate_Int16_To_Int8 (a[447:432])
dst[423:416] := Saturate_Int16_To_Int8 (a[463:448])
dst[431:424] := Saturate_Int16_To_Int8 (a[479:464])
dst[439:432] := Saturate_Int16_To_Int8 (a[495:480])
dst[447:440] := Saturate_Int16_To_Int8 (a[511:496])
dst[455:448] := Saturate_Int16_To_Int8 (b[399:384])
dst[463:456] := Saturate_Int16_To_Int8 (b[415:400])
dst[471:464] := Saturate_Int16_To_Int8 (b[431:416])
dst[479:472] := Saturate_Int16_To_Int8 (b[447:432])
dst[487:480] := Saturate_Int16_To_Int8 (b[463:448])
dst[495:488] := Saturate_Int16_To_Int8 (b[479:464])
dst[503:496] := Saturate_Int16_To_Int8 (b[495:480])
dst[511:504] := Saturate_Int16_To_Int8 (b[511:496])
dst[MAX:512] := 0
vpackssdw
__m128i _mm_mask_packs_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_packs_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpackssdw
CPUID Flags: AVX512VL + AVX512BW
Description
Convert packed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
tmp_dst[15:0] := Saturate_Int32_To_Int16 (a[31:0])
tmp_dst[31:16] := Saturate_Int32_To_Int16 (a[63:32])
tmp_dst[47:32] := Saturate_Int32_To_Int16 (a[95:64])
tmp_dst[63:48] := Saturate_Int32_To_Int16 (a[127:96])
tmp_dst[79:64] := Saturate_Int32_To_Int16 (b[31:0])
tmp_dst[95:80] := Saturate_Int32_To_Int16 (b[63:32])
tmp_dst[111:96] := Saturate_Int32_To_Int16 (b[95:64])
tmp_dst[127:112] := Saturate_Int32_To_Int16 (b[127:96])
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := tmp_dst[i+15:i]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:128] := 0
vpackssdw
__m128i _mm_maskz_packs_epi32 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_packs_epi32 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpackssdw
CPUID Flags: AVX512VL + AVX512BW
Description
Convert packed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
tmp_dst[15:0] := Saturate_Int32_To_Int16 (a[31:0])
tmp_dst[31:16] := Saturate_Int32_To_Int16 (a[63:32])
tmp_dst[47:32] := Saturate_Int32_To_Int16 (a[95:64])
tmp_dst[63:48] := Saturate_Int32_To_Int16 (a[127:96])
tmp_dst[79:64] := Saturate_Int32_To_Int16 (b[31:0])
tmp_dst[95:80] := Saturate_Int32_To_Int16 (b[63:32])
tmp_dst[111:96] := Saturate_Int32_To_Int16 (b[95:64])
tmp_dst[127:112] := Saturate_Int32_To_Int16 (b[127:96])
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := tmp_dst[i+15:i]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
packssdw
__m128i _mm_packs_epi32 (__m128i a, __m128i b)
Synopsis
__m128i _mm_packs_epi32 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: packssdw xmm, xmm
CPUID Flags: SSE2
Description
Convert packed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst.
Operation
dst[15:0] := Saturate_Int32_To_Int16 (a[31:0])
dst[31:16] := Saturate_Int32_To_Int16 (a[63:32])
dst[47:32] := Saturate_Int32_To_Int16 (a[95:64])
dst[63:48] := Saturate_Int32_To_Int16 (a[127:96])
dst[79:64] := Saturate_Int32_To_Int16 (b[31:0])
dst[95:80] := Saturate_Int32_To_Int16 (b[63:32])
dst[111:96] := Saturate_Int32_To_Int16 (b[95:64])
dst[127:112] := Saturate_Int32_To_Int16 (b[127:96])
Performance
vpackssdw
__m256i _mm256_mask_packs_epi32 (__m256i src, __mmask16 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_packs_epi32 (__m256i src, __mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpackssdw
CPUID Flags: AVX512VL + AVX512BW
Description
Convert packed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
tmp_dst[15:0] := Saturate_Int32_To_Int16 (a[31:0])
tmp_dst[31:16] := Saturate_Int32_To_Int16 (a[63:32])
tmp_dst[47:32] := Saturate_Int32_To_Int16 (a[95:64])
tmp_dst[63:48] := Saturate_Int32_To_Int16 (a[127:96])
tmp_dst[79:64] := Saturate_Int32_To_Int16 (b[31:0])
tmp_dst[95:80] := Saturate_Int32_To_Int16 (b[63:32])
tmp_dst[111:96] := Saturate_Int32_To_Int16 (b[95:64])
tmp_dst[127:112] := Saturate_Int32_To_Int16 (b[127:96])
tmp_dst[143:128] := Saturate_Int32_To_Int16 (a[159:128])
tmp_dst[159:144] := Saturate_Int32_To_Int16 (a[191:160])
tmp_dst[175:160] := Saturate_Int32_To_Int16 (a[223:192])
tmp_dst[191:176] := Saturate_Int32_To_Int16 (a[255:224])
tmp_dst[207:192] := Saturate_Int32_To_Int16 (b[159:128])
tmp_dst[223:208] := Saturate_Int32_To_Int16 (b[191:160])
tmp_dst[239:224] := Saturate_Int32_To_Int16 (b[223:192])
tmp_dst[255:240] := Saturate_Int32_To_Int16 (b[255:224])
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := tmp_dst[i+15:i]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
vpackssdw
__m256i _mm256_maskz_packs_epi32 (__mmask16 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_packs_epi32 (__mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpackssdw
CPUID Flags: AVX512VL + AVX512BW
Description
Convert packed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
tmp_dst[15:0] := Saturate_Int32_To_Int16 (a[31:0])
tmp_dst[31:16] := Saturate_Int32_To_Int16 (a[63:32])
tmp_dst[47:32] := Saturate_Int32_To_Int16 (a[95:64])
tmp_dst[63:48] := Saturate_Int32_To_Int16 (a[127:96])
tmp_dst[79:64] := Saturate_Int32_To_Int16 (b[31:0])
tmp_dst[95:80] := Saturate_Int32_To_Int16 (b[63:32])
tmp_dst[111:96] := Saturate_Int32_To_Int16 (b[95:64])
tmp_dst[127:112] := Saturate_Int32_To_Int16 (b[127:96])
tmp_dst[143:128] := Saturate_Int32_To_Int16 (a[159:128])
tmp_dst[159:144] := Saturate_Int32_To_Int16 (a[191:160])
tmp_dst[175:160] := Saturate_Int32_To_Int16 (a[223:192])
tmp_dst[191:176] := Saturate_Int32_To_Int16 (a[255:224])
tmp_dst[207:192] := Saturate_Int32_To_Int16 (b[159:128])
tmp_dst[223:208] := Saturate_Int32_To_Int16 (b[191:160])
tmp_dst[239:224] := Saturate_Int32_To_Int16 (b[223:192])
tmp_dst[255:240] := Saturate_Int32_To_Int16 (b[255:224])
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := tmp_dst[i+15:i]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpackssdw
__m256i _mm256_packs_epi32 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_packs_epi32 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpackssdw ymm, ymm, ymm
CPUID Flags: AVX2
Description
Convert packed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst.
Operation
dst[15:0] := Saturate_Int32_To_Int16 (a[31:0])
dst[31:16] := Saturate_Int32_To_Int16 (a[63:32])
dst[47:32] := Saturate_Int32_To_Int16 (a[95:64])
dst[63:48] := Saturate_Int32_To_Int16 (a[127:96])
dst[79:64] := Saturate_Int32_To_Int16 (b[31:0])
dst[95:80] := Saturate_Int32_To_Int16 (b[63:32])
dst[111:96] := Saturate_Int32_To_Int16 (b[95:64])
dst[127:112] := Saturate_Int32_To_Int16 (b[127:96])
dst[143:128] := Saturate_Int32_To_Int16 (a[159:128])
dst[159:144] := Saturate_Int32_To_Int16 (a[191:160])
dst[175:160] := Saturate_Int32_To_Int16 (a[223:192])
dst[191:176] := Saturate_Int32_To_Int16 (a[255:224])
dst[207:192] := Saturate_Int32_To_Int16 (b[159:128])
dst[223:208] := Saturate_Int32_To_Int16 (b[191:160])
dst[239:224] := Saturate_Int32_To_Int16 (b[223:192])
dst[255:240] := Saturate_Int32_To_Int16 (b[255:224])
dst[MAX:256] := 0
Performance
vpackssdw
__m512i _mm512_mask_packs_epi32 (__m512i src, __mmask32 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_packs_epi32 (__m512i src, __mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpackssdw
CPUID Flags: AVX512BW
Description
Convert packed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
tmp_dst[15:0] := Saturate_Int32_To_Int16 (a[31:0])
tmp_dst[31:16] := Saturate_Int32_To_Int16 (a[63:32])
tmp_dst[47:32] := Saturate_Int32_To_Int16 (a[95:64])
tmp_dst[63:48] := Saturate_Int32_To_Int16 (a[127:96])
tmp_dst[79:64] := Saturate_Int32_To_Int16 (b[31:0])
tmp_dst[95:80] := Saturate_Int32_To_Int16 (b[63:32])
tmp_dst[111:96] := Saturate_Int32_To_Int16 (b[95:64])
tmp_dst[127:112] := Saturate_Int32_To_Int16 (b[127:96])
tmp_dst[143:128] := Saturate_Int32_To_Int16 (a[159:128])
tmp_dst[159:144] := Saturate_Int32_To_Int16 (a[191:160])
tmp_dst[175:160] := Saturate_Int32_To_Int16 (a[223:192])
tmp_dst[191:176] := Saturate_Int32_To_Int16 (a[255:224])
tmp_dst[207:192] := Saturate_Int32_To_Int16 (b[159:128])
tmp_dst[223:208] := Saturate_Int32_To_Int16 (b[191:160])
tmp_dst[239:224] := Saturate_Int32_To_Int16 (b[223:192])
tmp_dst[255:240] := Saturate_Int32_To_Int16 (b[255:224])
tmp_dst[271:256] := Saturate_Int32_To_Int16 (a[287:256])
tmp_dst[287:272] := Saturate_Int32_To_Int16 (a[319:288])
tmp_dst[303:288] := Saturate_Int32_To_Int16 (a[351:320])
tmp_dst[319:304] := Saturate_Int32_To_Int16 (a[383:352])
tmp_dst[335:320] := Saturate_Int32_To_Int16 (b[287:256])
tmp_dst[351:336] := Saturate_Int32_To_Int16 (b[319:288])
tmp_dst[367:352] := Saturate_Int32_To_Int16 (b[351:320])
tmp_dst[383:368] := Saturate_Int32_To_Int16 (b[383:352])
tmp_dst[399:384] := Saturate_Int32_To_Int16 (a[415:384])
tmp_dst[415:400] := Saturate_Int32_To_Int16 (a[447:416])
tmp_dst[431:416] := Saturate_Int32_To_Int16 (a[479:448])
tmp_dst[447:432] := Saturate_Int32_To_Int16 (a[511:480])
tmp_dst[463:448] := Saturate_Int32_To_Int16 (b[415:384])
tmp_dst[479:464] := Saturate_Int32_To_Int16 (b[447:416])
tmp_dst[495:480] := Saturate_Int32_To_Int16 (b[479:448])
tmp_dst[511:496] := Saturate_Int32_To_Int16 (b[511:480])
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := tmp_dst[i+15:i]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:512] := 0
vpackssdw
__m512i _mm512_maskz_packs_epi32 (__mmask32 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_packs_epi32 (__mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpackssdw
CPUID Flags: AVX512BW
Description
Convert packed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
tmp_dst[15:0] := Saturate_Int32_To_Int16 (a[31:0])
tmp_dst[31:16] := Saturate_Int32_To_Int16 (a[63:32])
tmp_dst[47:32] := Saturate_Int32_To_Int16 (a[95:64])
tmp_dst[63:48] := Saturate_Int32_To_Int16 (a[127:96])
tmp_dst[79:64] := Saturate_Int32_To_Int16 (b[31:0])
tmp_dst[95:80] := Saturate_Int32_To_Int16 (b[63:32])
tmp_dst[111:96] := Saturate_Int32_To_Int16 (b[95:64])
tmp_dst[127:112] := Saturate_Int32_To_Int16 (b[127:96])
tmp_dst[143:128] := Saturate_Int32_To_Int16 (a[159:128])
tmp_dst[159:144] := Saturate_Int32_To_Int16 (a[191:160])
tmp_dst[175:160] := Saturate_Int32_To_Int16 (a[223:192])
tmp_dst[191:176] := Saturate_Int32_To_Int16 (a[255:224])
tmp_dst[207:192] := Saturate_Int32_To_Int16 (b[159:128])
tmp_dst[223:208] := Saturate_Int32_To_Int16 (b[191:160])
tmp_dst[239:224] := Saturate_Int32_To_Int16 (b[223:192])
tmp_dst[255:240] := Saturate_Int32_To_Int16 (b[255:224])
tmp_dst[271:256] := Saturate_Int32_To_Int16 (a[287:256])
tmp_dst[287:272] := Saturate_Int32_To_Int16 (a[319:288])
tmp_dst[303:288] := Saturate_Int32_To_Int16 (a[351:320])
tmp_dst[319:304] := Saturate_Int32_To_Int16 (a[383:352])
tmp_dst[335:320] := Saturate_Int32_To_Int16 (b[287:256])
tmp_dst[351:336] := Saturate_Int32_To_Int16 (b[319:288])
tmp_dst[367:352] := Saturate_Int32_To_Int16 (b[351:320])
tmp_dst[383:368] := Saturate_Int32_To_Int16 (b[383:352])
tmp_dst[399:384] := Saturate_Int32_To_Int16 (a[415:384])
tmp_dst[415:400] := Saturate_Int32_To_Int16 (a[447:416])
tmp_dst[431:416] := Saturate_Int32_To_Int16 (a[479:448])
tmp_dst[447:432] := Saturate_Int32_To_Int16 (a[511:480])
tmp_dst[463:448] := Saturate_Int32_To_Int16 (b[415:384])
tmp_dst[479:464] := Saturate_Int32_To_Int16 (b[447:416])
tmp_dst[495:480] := Saturate_Int32_To_Int16 (b[479:448])
tmp_dst[511:496] := Saturate_Int32_To_Int16 (b[511:480])
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := tmp_dst[i+15:i]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpackssdw
__m512i _mm512_packs_epi32 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_packs_epi32 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpackssdw
CPUID Flags: AVX512BW
Description
Convert packed 32-bit integers from a and b to packed 16-bit integers using signed saturation, and store the results in dst.
Operation
dst[15:0] := Saturate_Int32_To_Int16 (a[31:0])
dst[31:16] := Saturate_Int32_To_Int16 (a[63:32])
dst[47:32] := Saturate_Int32_To_Int16 (a[95:64])
dst[63:48] := Saturate_Int32_To_Int16 (a[127:96])
dst[79:64] := Saturate_Int32_To_Int16 (b[31:0])
dst[95:80] := Saturate_Int32_To_Int16 (b[63:32])
dst[111:96] := Saturate_Int32_To_Int16 (b[95:64])
dst[127:112] := Saturate_Int32_To_Int16 (b[127:96])
dst[143:128] := Saturate_Int32_To_Int16 (a[159:128])
dst[159:144] := Saturate_Int32_To_Int16 (a[191:160])
dst[175:160] := Saturate_Int32_To_Int16 (a[223:192])
dst[191:176] := Saturate_Int32_To_Int16 (a[255:224])
dst[207:192] := Saturate_Int32_To_Int16 (b[159:128])
dst[223:208] := Saturate_Int32_To_Int16 (b[191:160])
dst[239:224] := Saturate_Int32_To_Int16 (b[223:192])
dst[255:240] := Saturate_Int32_To_Int16 (b[255:224])
dst[271:256] := Saturate_Int32_To_Int16 (a[287:256])
dst[287:272] := Saturate_Int32_To_Int16 (a[319:288])
dst[303:288] := Saturate_Int32_To_Int16 (a[351:320])
dst[319:304] := Saturate_Int32_To_Int16 (a[383:352])
dst[335:320] := Saturate_Int32_To_Int16 (b[287:256])
dst[351:336] := Saturate_Int32_To_Int16 (b[319:288])
dst[367:352] := Saturate_Int32_To_Int16 (b[351:320])
dst[383:368] := Saturate_Int32_To_Int16 (b[383:352])
dst[399:384] := Saturate_Int32_To_Int16 (a[415:384])
dst[415:400] := Saturate_Int32_To_Int16 (a[447:416])
dst[431:416] := Saturate_Int32_To_Int16 (a[479:448])
dst[447:432] := Saturate_Int32_To_Int16 (a[511:480])
dst[463:448] := Saturate_Int32_To_Int16 (b[415:384])
dst[479:464] := Saturate_Int32_To_Int16 (b[447:416])
dst[495:480] := Saturate_Int32_To_Int16 (b[479:448])
dst[511:496] := Saturate_Int32_To_Int16 (b[511:480])
dst[MAX:512] := 0
vpackstorehd
void _mm512_mask_packstorehi_epi32 (void* mt, __mmask16 k, __m512i v1)
Synopsis
void _mm512_mask_packstorehi_epi32 (void* mt, __mmask16 k, __m512i v1)
#include "immintrin.h"
Instruction: vpackstorehd m512 {k}, zmm
CPUID Flags: KNCNI
Description
Stores packed 32-bit integer elements of v1 into a doubleword stream at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). Elements are stored to memory according to element selector k (elements are skipped when the corresponding mask bit is not set).
Operation
storeOffset := 0
foundNext64BytesBoundary := false
addr = mt-64
FOR j := 0 to 15
IF k[j]
IF foundNext64BytesBoundary == false
IF ((addr + (storeOffset + 1)*4) % 64) == 0
foundNext64BytesBoundary = true
FI
ELSE
i := j*32
MEM[addr + storeOffset*4] := v1[i+31:i]
FI
storeOffset := storeOffset + 1
FI
ENDFOR
dst[MAX:512] := 0
vpackstorehd
void _mm512_packstorehi_epi32 (void* mt, __m512i v1)
Synopsis
void _mm512_packstorehi_epi32 (void* mt, __m512i v1)
#include "immintrin.h"
Instruction: vpackstorehd m512 {k}, zmm
CPUID Flags: KNCNI
Description
Stores packed 32-bit integer elements of v1 into a doubleword stream at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)).
Operation
storeOffset := 0
foundNext64BytesBoundary := false
addr = mt-64
FOR j := 0 to 15
IF foundNext64BytesBoundary == false
IF ((addr + (storeOffset + 1)*4) % 64) == 0
foundNext64BytesBoundary = true
FI
ELSE
i := j*32
MEM[addr + storeOffset*4] := v1[i+31:i]
FI
storeOffset := storeOffset + 1
ENDFOR
dst[MAX:512] := 0
vpackstorehq
void _mm512_mask_packstorehi_epi64 (void* mt, __mmask8 k, __m512i v1)
Synopsis
void _mm512_mask_packstorehi_epi64 (void* mt, __mmask8 k, __m512i v1)
#include "immintrin.h"
Instruction: vpackstorehq m512 {k}, zmm
CPUID Flags: KNCNI
Description
Stores packed 64-bit integer elements of v1 into a quadword stream at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). Elements are stored to memory according to element selector k (elements are skipped when the corresponding mask bit is not set).
Operation
storeOffset := 0
foundNext64BytesBoundary := false
addr = mt-64
FOR j := 0 to 7
IF k[j]
IF foundNext64BytesBoundary == false
IF ((addr + (storeOffset + 1)*8) % 64) == 0
foundNext64BytesBoundary = true
FI
ELSE
i := j*64
MEM[addr + storeOffset*8] := v1[i+63:i]
FI
storeOffset := storeOffset + 1
FI
ENDFOR
dst[MAX:512] := 0
vpackstorehq
void _mm512_packstorehi_epi64 (void* mt, __m512i v1)
Synopsis
void _mm512_packstorehi_epi64 (void* mt, __m512i v1)
#include "immintrin.h"
Instruction: vpackstorehq m512 {k}, zmm
CPUID Flags: KNCNI
Description
Stores packed 64-bit integer elements of v1 into a quadword stream at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)).
Operation
storeOffset := 0
foundNext64BytesBoundary := false
addr = mt-64
FOR j := 0 to 7
IF foundNext64BytesBoundary == false
IF ((addr + (storeOffset + 1)*8) % 64) == 0
foundNext64BytesBoundary = true
FI
ELSE
i := j*64
MEM[addr + storeOffset*8] := v1[i+63:i]
FI
storeOffset := storeOffset + 1
ENDFOR
dst[MAX:512] := 0
vpackstorehpd
void _mm512_mask_packstorehi_pd (void* mt, __mmask8 k, __m512d v1)
Synopsis
void _mm512_mask_packstorehi_pd (void* mt, __mmask8 k, __m512d v1)
#include "immintrin.h"
Instruction: vpackstorehpd m512 {k}, zmm
CPUID Flags: KNCNI
Description
Stores packed double-precision (64-bit) floating-point elements of v1 into a quadword stream at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). Elements are stored to memory according to element selector k (elements are skipped when the corresponding mask bit is not set).
Operation
storeOffset := 0
foundNext64BytesBoundary := false
addr = mt-64
FOR j := 0 to 7
IF k[j]
IF foundNext64BytesBoundary == false
IF ((addr + (storeOffset + 1)*8) % 64) == 0
foundNext64BytesBoundary = true
FI
ELSE
i := j*64
MEM[addr + storeOffset*8] := v1[i+63:i]
FI
storeOffset := storeOffset + 1
FI
ENDFOR
dst[MAX:512] := 0
vpackstorehpd
void _mm512_packstorehi_pd (void* mt, __m512d v1)
Synopsis
void _mm512_packstorehi_pd (void* mt, __m512d v1)
#include "immintrin.h"
Instruction: vpackstorehpd m512 {k}, zmm
CPUID Flags: KNCNI
Description
Stores packed double-precision (64-bit) floating-point elements of v1 into a quadword stream at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)).
Operation
storeOffset := 0
foundNext64BytesBoundary := false
addr = mt-64
FOR j := 0 to 7
IF foundNext64BytesBoundary == false
IF ((addr + (storeOffset + 1)*8) % 64) == 0
foundNext64BytesBoundary = true
FI
ELSE
i := j*64
MEM[addr + storeOffset*8] := v1[i+63:i]
FI
storeOffset := storeOffset + 1
ENDFOR
dst[MAX:512] := 0
vpackstorehps
void _mm512_mask_packstorehi_ps (void* mt, __mmask16 k, __m512 v1)
Synopsis
void _mm512_mask_packstorehi_ps (void* mt, __mmask16 k, __m512 v1)
#include "immintrin.h"
Instruction: vpackstorehps m512 {k}, zmm
CPUID Flags: KNCNI
Description
Stores packed single-precision (32-bit) floating-point elements of v1 into a doubleword stream at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)). Elements are stored to memory according to element selector k (elements are skipped when the corresponding mask bit is not set).
Operation
storeOffset := 0
foundNext64BytesBoundary := false
addr = mt-64
FOR j := 0 to 15
IF k[j]
IF foundNext64BytesBoundary == false
IF ((addr + (storeOffset + 1)*4) % 64) == 0
foundNext64BytesBoundary = true
FI
ELSE
i := j*32
MEM[addr + storeOffset*4] := v1[i+31:i]
FI
storeOffset := storeOffset + 1
FI
ENDFOR
dst[MAX:512] := 0
vpackstorehps
void _mm512_packstorehi_ps (void* mt, __m512 v1)
Synopsis
void _mm512_packstorehi_ps (void* mt, __m512 v1)
#include "immintrin.h"
Instruction: vpackstorehps m512 {k}, zmm
CPUID Flags: KNCNI
Description
Stores packed single-precision (32-bit) floating-point elements of v1 into a doubleword stream at a logically mapped starting address (mt-64), storing the high-64-byte elements of that stream (those elements of the stream that map at or after the first 64-byte-aligned address following (mt-64)).
Operation
storeOffset := 0
foundNext64BytesBoundary := false
addr = mt-64
FOR j := 0 to 15
IF foundNext64BytesBoundary == false
IF ((addr + (storeOffset + 1)*4) % 64) == 0
foundNext64BytesBoundary = true
FI
ELSE
i := j*32
MEM[addr + storeOffset*4] := v1[i+31:i]
FI
storeOffset := storeOffset + 1
ENDFOR
dst[MAX:512] := 0
vpackstoreld
void _mm512_mask_packstorelo_epi32 (void* mt, __mmask16 k, __m512i v1)
Synopsis
void _mm512_mask_packstorelo_epi32 (void* mt, __mmask16 k, __m512i v1)
#include "immintrin.h"
Instruction: vpackstoreld m512 {k}, zmm
CPUID Flags: KNCNI
Description
Stores packed 32-bit integer elements of v1 into a doubleword stream at a logically mapped starting address mt, storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following mt). Elements are stored to memory according to element selector k (elements are skipped when the corresponding mask bit is not set).
Operation
storeOffset := 0
addr = mt
FOR j := 0 to 15
IF k[j]
i := j*32
MEM[addr + storeOffset*4] := v1[i+31:i]
storeOffset := storeOffset + 1
IF ((addr + storeOffset*4) % 64) == 0
BREAK
FI
FI
ENDFOR
dst[MAX:512] := 0
vpackstoreld
void _mm512_packstorelo_epi32 (void* mt, __m512i v1)
Synopsis
void _mm512_packstorelo_epi32 (void* mt, __m512i v1)
#include "immintrin.h"
Instruction: vpackstoreld m512 {k}, zmm
CPUID Flags: KNCNI
Description
Stores packed 32-bit integer elements of v1 into a doubleword stream at a logically mapped starting address mt, storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following mt).
Operation
storeOffset := 0
addr = mt
FOR j := 0 to 15
i := j*32
MEM[addr + storeOffset*4] := v1[i+31:i]
storeOffset := storeOffset + 1
IF ((addr + storeOffset*4) % 64) == 0
BREAK
FI
ENDFOR
dst[MAX:512] := 0
vpackstorelq
void _mm512_mask_packstorelo_epi64 (void* mt, __mmask8 k, __m512i v1)
Synopsis
void _mm512_mask_packstorelo_epi64 (void* mt, __mmask8 k, __m512i v1)
#include "immintrin.h"
Instruction: vpackstorelq m512 {k}, zmm
CPUID Flags: KNCNI
Description
Stores packed 64-bit integer elements of v1 into a quadword stream at a logically mapped starting address mt, storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following mt). Elements are stored to memory according to element selector k (elements are skipped when the corresponding mask bit is not set).
Operation
storeOffset := 0
addr = mt
FOR j := 0 to 7
IF k[j]
i := j*64
MEM[addr + storeOffset*8] := v1[i+63:i]
storeOffset := storeOffset + 1
IF ((addr + storeOffset*8) % 64) == 0
BREAK
FI
FI
ENDFOR
dst[MAX:512] := 0
vpackstorelq
void _mm512_packstorelo_epi64 (void* mt, __m512i v1)
Synopsis
void _mm512_packstorelo_epi64 (void* mt, __m512i v1)
#include "immintrin.h"
Instruction: vpackstorelq m512 {k}, zmm
CPUID Flags: KNCNI
Description
Stores packed 64-bit integer elements of v1 into a quadword stream at a logically mapped starting address mt, storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following mt).
Operation
storeOffset := 0
addr = mt
FOR j := 0 to 7
i := j*64
MEM[addr + storeOffset*8] := v1[i+63:i]
storeOffset := storeOffset + 1
IF ((addr + storeOffset*8) % 64) == 0
BREAK
FI
ENDFOR
dst[MAX:512] := 0
vpackstorelpd
void _mm512_mask_packstorelo_pd (void* mt, __mmask8 k, __m512d v1)
Synopsis
void _mm512_mask_packstorelo_pd (void* mt, __mmask8 k, __m512d v1)
#include "immintrin.h"
Instruction: vpackstorelpd m512 {k}, zmm
CPUID Flags: KNCNI
Description
Stores packed double-precision (64-bit) floating-point elements of v1 into a quadword stream at a logically mapped starting address mt, storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following mt). Elements are stored to memory according to element selector k (elements are skipped when the corresponding mask bit is not set).
Operation
storeOffset := 0
addr = mt
FOR j := 0 to 7
IF k[j]
i := j*64
MEM[addr + storeOffset*8] := v1[i+63:i]
storeOffset := storeOffset + 1
IF ((addr + storeOffset*8) % 64) == 0
BREAK
FI
FI
ENDFOR
dst[MAX:512] := 0
vpackstorelpd
void _mm512_packstorelo_pd (void* mt, __m512d v1)
Synopsis
void _mm512_packstorelo_pd (void* mt, __m512d v1)
#include "immintrin.h"
Instruction: vpackstorelpd m512 {k}, zmm
CPUID Flags: KNCNI
Description
Stores packed double-precision (64-bit) floating-point elements of v1 into a quadword stream at a logically mapped starting address mt, storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following mt).
Operation
storeOffset := 0
addr = mt
FOR j := 0 to 7
i := j*64
MEM[addr + storeOffset*8] := v1[i+63:i]
storeOffset := storeOffset + 1
IF ((addr + storeOffset*8) % 64) == 0
BREAK
FI
ENDFOR
dst[MAX:512] := 0
vpackstorelps
void _mm512_mask_packstorelo_ps (void* mt, __mmask16 k, __m512 v1)
Synopsis
void _mm512_mask_packstorelo_ps (void* mt, __mmask16 k, __m512 v1)
#include "immintrin.h"
Instruction: vpackstorelps m512 {k}, zmm
CPUID Flags: KNCNI
Description
Stores packed single-precision (32-bit) floating-point elements of v1 into a doubleword stream at a logically mapped starting address mt, storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following mt). Elements are stored to memory according to element selector k (elements are skipped when the corresponding mask bit is not set).
Operation
storeOffset := 0
addr = mt
FOR j := 0 to 15
IF k[j]
i := j*32
MEM[addr + storeOffset*4] := v1[i+31:i]
storeOffset := storeOffset + 1
IF ((addr + storeOffset*4) % 64) == 0
BREAK
FI
FI
ENDFOR
dst[MAX:512] := 0
vpackstorelps
void _mm512_packstorelo_ps (void* mt, __m512 v1)
Synopsis
void _mm512_packstorelo_ps (void* mt, __m512 v1)
#include "immintrin.h"
Instruction: vpackstorelps m512 {k}, zmm
CPUID Flags: KNCNI
Description
Stores packed single-precision (32-bit) floating-point elements of v1 into a doubleword stream at a logically mapped starting address mt, storing the low-64-byte elements of that stream (those elements of the stream that map before the first 64-byte-aligned address following mt).
Operation
storeOffset := 0
addr = mt
FOR j := 0 to 15
i := j*32
MEM[addr + storeOffset*4] := v1[i+31:i]
storeOffset := storeOffset + 1
IF ((addr + storeOffset*4) % 64) == 0
BREAK
FI
ENDFOR
dst[MAX:512] := 0
vpackuswb
__m128i _mm_mask_packus_epi16 (__m128i src, __mmask16 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_packus_epi16 (__m128i src, __mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpackuswb
CPUID Flags: AVX512VL + AVX512BW
Description
Convert packed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
tmp_dst[7:0] := Saturate_Int16_To_UnsignedInt8 (a[15:0])
tmp_dst[15:8] := Saturate_Int16_To_UnsignedInt8 (a[31:16])
tmp_dst[23:16] := Saturate_Int16_To_UnsignedInt8 (a[47:32])
tmp_dst[31:24] := Saturate_Int16_To_UnsignedInt8 (a[63:48])
tmp_dst[39:32] := Saturate_Int16_To_UnsignedInt8 (a[79:64])
tmp_dst[47:40] := Saturate_Int16_To_UnsignedInt8 (a[95:80])
tmp_dst[55:48] := Saturate_Int16_To_UnsignedInt8 (a[111:96])
tmp_dst[63:56] := Saturate_Int16_To_UnsignedInt8 (a[127:112])
tmp_dst[71:64] := Saturate_Int16_To_UnsignedInt8 (b[15:0])
tmp_dst[79:72] := Saturate_Int16_To_UnsignedInt8 (b[31:16])
tmp_dst[87:80] := Saturate_Int16_To_UnsignedInt8 (b[47:32])
tmp_dst[95:88] := Saturate_Int16_To_UnsignedInt8 (b[63:48])
tmp_dst[103:96] := Saturate_Int16_To_UnsignedInt8 (b[79:64])
tmp_dst[111:104] := Saturate_Int16_To_UnsignedInt8 (b[95:80])
tmp_dst[119:112] := Saturate_Int16_To_UnsignedInt8 (b[111:96])
tmp_dst[127:120] := Saturate_Int16_To_UnsignedInt8 (b[127:112])
FOR j := 0 to 15
i := j*8
IF k[j]
dst[i+7:i] := tmp_dst[i+7:i]
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:128] := 0
vpackuswb
__m128i _mm_maskz_packus_epi16 (__mmask16 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_packus_epi16 (__mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpackuswb
CPUID Flags: AVX512VL + AVX512BW
Description
Convert packed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
tmp_dst[7:0] := Saturate_Int16_To_UnsignedInt8 (a[15:0])
tmp_dst[15:8] := Saturate_Int16_To_UnsignedInt8 (a[31:16])
tmp_dst[23:16] := Saturate_Int16_To_UnsignedInt8 (a[47:32])
tmp_dst[31:24] := Saturate_Int16_To_UnsignedInt8 (a[63:48])
tmp_dst[39:32] := Saturate_Int16_To_UnsignedInt8 (a[79:64])
tmp_dst[47:40] := Saturate_Int16_To_UnsignedInt8 (a[95:80])
tmp_dst[55:48] := Saturate_Int16_To_UnsignedInt8 (a[111:96])
tmp_dst[63:56] := Saturate_Int16_To_UnsignedInt8 (a[127:112])
tmp_dst[71:64] := Saturate_Int16_To_UnsignedInt8 (b[15:0])
tmp_dst[79:72] := Saturate_Int16_To_UnsignedInt8 (b[31:16])
tmp_dst[87:80] := Saturate_Int16_To_UnsignedInt8 (b[47:32])
tmp_dst[95:88] := Saturate_Int16_To_UnsignedInt8 (b[63:48])
tmp_dst[103:96] := Saturate_Int16_To_UnsignedInt8 (b[79:64])
tmp_dst[111:104] := Saturate_Int16_To_UnsignedInt8 (b[95:80])
tmp_dst[119:112] := Saturate_Int16_To_UnsignedInt8 (b[111:96])
tmp_dst[127:120] := Saturate_Int16_To_UnsignedInt8 (b[127:112])
FOR j := 0 to 15
i := j*8
IF k[j]
dst[i+7:i] := tmp_dst[i+7:i]
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
packuswb
__m128i _mm_packus_epi16 (__m128i a, __m128i b)
Synopsis
__m128i _mm_packus_epi16 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: packuswb xmm, xmm
CPUID Flags: SSE2
Description
Convert packed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst.
Operation
dst[7:0] := Saturate_Int16_To_UnsignedInt8 (a[15:0])
dst[15:8] := Saturate_Int16_To_UnsignedInt8 (a[31:16])
dst[23:16] := Saturate_Int16_To_UnsignedInt8 (a[47:32])
dst[31:24] := Saturate_Int16_To_UnsignedInt8 (a[63:48])
dst[39:32] := Saturate_Int16_To_UnsignedInt8 (a[79:64])
dst[47:40] := Saturate_Int16_To_UnsignedInt8 (a[95:80])
dst[55:48] := Saturate_Int16_To_UnsignedInt8 (a[111:96])
dst[63:56] := Saturate_Int16_To_UnsignedInt8 (a[127:112])
dst[71:64] := Saturate_Int16_To_UnsignedInt8 (b[15:0])
dst[79:72] := Saturate_Int16_To_UnsignedInt8 (b[31:16])
dst[87:80] := Saturate_Int16_To_UnsignedInt8 (b[47:32])
dst[95:88] := Saturate_Int16_To_UnsignedInt8 (b[63:48])
dst[103:96] := Saturate_Int16_To_UnsignedInt8 (b[79:64])
dst[111:104] := Saturate_Int16_To_UnsignedInt8 (b[95:80])
dst[119:112] := Saturate_Int16_To_UnsignedInt8 (b[111:96])
dst[127:120] := Saturate_Int16_To_UnsignedInt8 (b[127:112])
Performance
vpackuswb
__m256i _mm256_mask_packus_epi16 (__m256i src, __mmask32 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_packus_epi16 (__m256i src, __mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpackuswb
CPUID Flags: AVX512VL + AVX512BW
Description
Convert packed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
tmp_dst[7:0] := Saturate_Int16_To_UnsignedInt8 (a[15:0])
tmp_dst[15:8] := Saturate_Int16_To_UnsignedInt8 (a[31:16])
tmp_dst[23:16] := Saturate_Int16_To_UnsignedInt8 (a[47:32])
tmp_dst[31:24] := Saturate_Int16_To_UnsignedInt8 (a[63:48])
tmp_dst[39:32] := Saturate_Int16_To_UnsignedInt8 (a[79:64])
tmp_dst[47:40] := Saturate_Int16_To_UnsignedInt8 (a[95:80])
tmp_dst[55:48] := Saturate_Int16_To_UnsignedInt8 (a[111:96])
tmp_dst[63:56] := Saturate_Int16_To_UnsignedInt8 (a[127:112])
tmp_dst[71:64] := Saturate_Int16_To_UnsignedInt8 (b[15:0])
tmp_dst[79:72] := Saturate_Int16_To_UnsignedInt8 (b[31:16])
tmp_dst[87:80] := Saturate_Int16_To_UnsignedInt8 (b[47:32])
tmp_dst[95:88] := Saturate_Int16_To_UnsignedInt8 (b[63:48])
tmp_dst[103:96] := Saturate_Int16_To_UnsignedInt8 (b[79:64])
tmp_dst[111:104] := Saturate_Int16_To_UnsignedInt8 (b[95:80])
tmp_dst[119:112] := Saturate_Int16_To_UnsignedInt8 (b[111:96])
tmp_dst[127:120] := Saturate_Int16_To_UnsignedInt8 (b[127:112])
tmp_dst[135:128] := Saturate_Int16_To_UnsignedInt8 (a[143:128])
tmp_dst[143:136] := Saturate_Int16_To_UnsignedInt8 (a[159:144])
tmp_dst[151:144] := Saturate_Int16_To_UnsignedInt8 (a[175:160])
tmp_dst[159:152] := Saturate_Int16_To_UnsignedInt8 (a[191:176])
tmp_dst[167:160] := Saturate_Int16_To_UnsignedInt8 (a[207:192])
tmp_dst[175:168] := Saturate_Int16_To_UnsignedInt8 (a[223:208])
tmp_dst[183:176] := Saturate_Int16_To_UnsignedInt8 (a[239:224])
tmp_dst[191:184] := Saturate_Int16_To_UnsignedInt8 (a[255:240])
tmp_dst[199:192] := Saturate_Int16_To_UnsignedInt8 (b[143:128])
tmp_dst[207:200] := Saturate_Int16_To_UnsignedInt8 (b[159:144])
tmp_dst[215:208] := Saturate_Int16_To_UnsignedInt8 (b[175:160])
tmp_dst[223:216] := Saturate_Int16_To_UnsignedInt8 (b[191:176])
tmp_dst[231:224] := Saturate_Int16_To_UnsignedInt8 (b[207:192])
tmp_dst[239:232] := Saturate_Int16_To_UnsignedInt8 (b[223:208])
tmp_dst[247:240] := Saturate_Int16_To_UnsignedInt8 (b[239:224])
tmp_dst[255:248] := Saturate_Int16_To_UnsignedInt8 (b[255:240])
FOR j := 0 to 31
i := j*8
IF k[j]
dst[i+7:i] := tmp_dst[i+7:i]
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:256] := 0
vpackuswb
__m256i _mm256_maskz_packus_epi16 (__mmask32 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_packus_epi16 (__mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpackuswb
CPUID Flags: AVX512VL + AVX512BW
Description
Convert packed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
tmp_dst[7:0] := Saturate_Int16_To_UnsignedInt8 (a[15:0])
tmp_dst[15:8] := Saturate_Int16_To_UnsignedInt8 (a[31:16])
tmp_dst[23:16] := Saturate_Int16_To_UnsignedInt8 (a[47:32])
tmp_dst[31:24] := Saturate_Int16_To_UnsignedInt8 (a[63:48])
tmp_dst[39:32] := Saturate_Int16_To_UnsignedInt8 (a[79:64])
tmp_dst[47:40] := Saturate_Int16_To_UnsignedInt8 (a[95:80])
tmp_dst[55:48] := Saturate_Int16_To_UnsignedInt8 (a[111:96])
tmp_dst[63:56] := Saturate_Int16_To_UnsignedInt8 (a[127:112])
tmp_dst[71:64] := Saturate_Int16_To_UnsignedInt8 (b[15:0])
tmp_dst[79:72] := Saturate_Int16_To_UnsignedInt8 (b[31:16])
tmp_dst[87:80] := Saturate_Int16_To_UnsignedInt8 (b[47:32])
tmp_dst[95:88] := Saturate_Int16_To_UnsignedInt8 (b[63:48])
tmp_dst[103:96] := Saturate_Int16_To_UnsignedInt8 (b[79:64])
tmp_dst[111:104] := Saturate_Int16_To_UnsignedInt8 (b[95:80])
tmp_dst[119:112] := Saturate_Int16_To_UnsignedInt8 (b[111:96])
tmp_dst[127:120] := Saturate_Int16_To_UnsignedInt8 (b[127:112])
tmp_dst[135:128] := Saturate_Int16_To_UnsignedInt8 (a[143:128])
tmp_dst[143:136] := Saturate_Int16_To_UnsignedInt8 (a[159:144])
tmp_dst[151:144] := Saturate_Int16_To_UnsignedInt8 (a[175:160])
tmp_dst[159:152] := Saturate_Int16_To_UnsignedInt8 (a[191:176])
tmp_dst[167:160] := Saturate_Int16_To_UnsignedInt8 (a[207:192])
tmp_dst[175:168] := Saturate_Int16_To_UnsignedInt8 (a[223:208])
tmp_dst[183:176] := Saturate_Int16_To_UnsignedInt8 (a[239:224])
tmp_dst[191:184] := Saturate_Int16_To_UnsignedInt8 (a[255:240])
tmp_dst[199:192] := Saturate_Int16_To_UnsignedInt8 (b[143:128])
tmp_dst[207:200] := Saturate_Int16_To_UnsignedInt8 (b[159:144])
tmp_dst[215:208] := Saturate_Int16_To_UnsignedInt8 (b[175:160])
tmp_dst[223:216] := Saturate_Int16_To_UnsignedInt8 (b[191:176])
tmp_dst[231:224] := Saturate_Int16_To_UnsignedInt8 (b[207:192])
tmp_dst[239:232] := Saturate_Int16_To_UnsignedInt8 (b[223:208])
tmp_dst[247:240] := Saturate_Int16_To_UnsignedInt8 (b[239:224])
tmp_dst[255:248] := Saturate_Int16_To_UnsignedInt8 (b[255:240])
FOR j := 0 to 31
i := j*8
IF k[j]
dst[i+7:i] := tmp_dst[i+7:i]
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpackuswb
__m256i _mm256_packus_epi16 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_packus_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpackuswb ymm, ymm, ymm
CPUID Flags: AVX2
Description
Convert packed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst.
Operation
dst[7:0] := Saturate_Int16_To_UnsignedInt8 (a[15:0])
dst[15:8] := Saturate_Int16_To_UnsignedInt8 (a[31:16])
dst[23:16] := Saturate_Int16_To_UnsignedInt8 (a[47:32])
dst[31:24] := Saturate_Int16_To_UnsignedInt8 (a[63:48])
dst[39:32] := Saturate_Int16_To_UnsignedInt8 (a[79:64])
dst[47:40] := Saturate_Int16_To_UnsignedInt8 (a[95:80])
dst[55:48] := Saturate_Int16_To_UnsignedInt8 (a[111:96])
dst[63:56] := Saturate_Int16_To_UnsignedInt8 (a[127:112])
dst[71:64] := Saturate_Int16_To_UnsignedInt8 (b[15:0])
dst[79:72] := Saturate_Int16_To_UnsignedInt8 (b[31:16])
dst[87:80] := Saturate_Int16_To_UnsignedInt8 (b[47:32])
dst[95:88] := Saturate_Int16_To_UnsignedInt8 (b[63:48])
dst[103:96] := Saturate_Int16_To_UnsignedInt8 (b[79:64])
dst[111:104] := Saturate_Int16_To_UnsignedInt8 (b[95:80])
dst[119:112] := Saturate_Int16_To_UnsignedInt8 (b[111:96])
dst[127:120] := Saturate_Int16_To_UnsignedInt8 (b[127:112])
dst[135:128] := Saturate_Int16_To_UnsignedInt8 (a[143:128])
dst[143:136] := Saturate_Int16_To_UnsignedInt8 (a[159:144])
dst[151:144] := Saturate_Int16_To_UnsignedInt8 (a[175:160])
dst[159:152] := Saturate_Int16_To_UnsignedInt8 (a[191:176])
dst[167:160] := Saturate_Int16_To_UnsignedInt8 (a[207:192])
dst[175:168] := Saturate_Int16_To_UnsignedInt8 (a[223:208])
dst[183:176] := Saturate_Int16_To_UnsignedInt8 (a[239:224])
dst[191:184] := Saturate_Int16_To_UnsignedInt8 (a[255:240])
dst[199:192] := Saturate_Int16_To_UnsignedInt8 (b[143:128])
dst[207:200] := Saturate_Int16_To_UnsignedInt8 (b[159:144])
dst[215:208] := Saturate_Int16_To_UnsignedInt8 (b[175:160])
dst[223:216] := Saturate_Int16_To_UnsignedInt8 (b[191:176])
dst[231:224] := Saturate_Int16_To_UnsignedInt8 (b[207:192])
dst[239:232] := Saturate_Int16_To_UnsignedInt8 (b[223:208])
dst[247:240] := Saturate_Int16_To_UnsignedInt8 (b[239:224])
dst[255:248] := Saturate_Int16_To_UnsignedInt8 (b[255:240])
dst[MAX:256] := 0
Performance
vpackuswb
__m512i _mm512_mask_packus_epi16 (__m512i src, __mmask64 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_packus_epi16 (__m512i src, __mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpackuswb
CPUID Flags: AVX512BW
Description
Convert packed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
tmp_dst[7:0] := Saturate_Int16_To_UnsignedInt8 (a[15:0])
tmp_dst[15:8] := Saturate_Int16_To_UnsignedInt8 (a[31:16])
tmp_dst[23:16] := Saturate_Int16_To_UnsignedInt8 (a[47:32])
tmp_dst[31:24] := Saturate_Int16_To_UnsignedInt8 (a[63:48])
tmp_dst[39:32] := Saturate_Int16_To_UnsignedInt8 (a[79:64])
tmp_dst[47:40] := Saturate_Int16_To_UnsignedInt8 (a[95:80])
tmp_dst[55:48] := Saturate_Int16_To_UnsignedInt8 (a[111:96])
tmp_dst[63:56] := Saturate_Int16_To_UnsignedInt8 (a[127:112])
tmp_dst[71:64] := Saturate_Int16_To_UnsignedInt8 (b[15:0])
tmp_dst[79:72] := Saturate_Int16_To_UnsignedInt8 (b[31:16])
tmp_dst[87:80] := Saturate_Int16_To_UnsignedInt8 (b[47:32])
tmp_dst[95:88] := Saturate_Int16_To_UnsignedInt8 (b[63:48])
tmp_dst[103:96] := Saturate_Int16_To_UnsignedInt8 (b[79:64])
tmp_dst[111:104] := Saturate_Int16_To_UnsignedInt8 (b[95:80])
tmp_dst[119:112] := Saturate_Int16_To_UnsignedInt8 (b[111:96])
tmp_dst[127:120] := Saturate_Int16_To_UnsignedInt8 (b[127:112])
tmp_dst[135:128] := Saturate_Int16_To_UnsignedInt8 (a[143:128])
tmp_dst[143:136] := Saturate_Int16_To_UnsignedInt8 (a[159:144])
tmp_dst[151:144] := Saturate_Int16_To_UnsignedInt8 (a[175:160])
tmp_dst[159:152] := Saturate_Int16_To_UnsignedInt8 (a[191:176])
tmp_dst[167:160] := Saturate_Int16_To_UnsignedInt8 (a[207:192])
tmp_dst[175:168] := Saturate_Int16_To_UnsignedInt8 (a[223:208])
tmp_dst[183:176] := Saturate_Int16_To_UnsignedInt8 (a[239:224])
tmp_dst[191:184] := Saturate_Int16_To_UnsignedInt8 (a[255:240])
tmp_dst[199:192] := Saturate_Int16_To_UnsignedInt8 (b[143:128])
tmp_dst[207:200] := Saturate_Int16_To_UnsignedInt8 (b[159:144])
tmp_dst[215:208] := Saturate_Int16_To_UnsignedInt8 (b[175:160])
tmp_dst[223:216] := Saturate_Int16_To_UnsignedInt8 (b[191:176])
tmp_dst[231:224] := Saturate_Int16_To_UnsignedInt8 (b[207:192])
tmp_dst[239:232] := Saturate_Int16_To_UnsignedInt8 (b[223:208])
tmp_dst[247:240] := Saturate_Int16_To_UnsignedInt8 (b[239:224])
tmp_dst[255:248] := Saturate_Int16_To_UnsignedInt8 (b[255:240])
tmp_dst[263:256] := Saturate_Int16_To_UnsignedInt8 (a[271:256])
tmp_dst[271:264] := Saturate_Int16_To_UnsignedInt8 (a[287:272])
tmp_dst[279:272] := Saturate_Int16_To_UnsignedInt8 (a[303:288])
tmp_dst[287:280] := Saturate_Int16_To_UnsignedInt8 (a[319:304])
tmp_dst[295:288] := Saturate_Int16_To_UnsignedInt8 (a[335:320])
tmp_dst[303:296] := Saturate_Int16_To_UnsignedInt8 (a[351:336])
tmp_dst[311:304] := Saturate_Int16_To_UnsignedInt8 (a[367:352])
tmp_dst[319:312] := Saturate_Int16_To_UnsignedInt8 (a[383:368])
tmp_dst[327:320] := Saturate_Int16_To_UnsignedInt8 (b[271:256])
tmp_dst[335:328] := Saturate_Int16_To_UnsignedInt8 (b[287:272])
tmp_dst[343:336] := Saturate_Int16_To_UnsignedInt8 (b[303:288])
tmp_dst[351:344] := Saturate_Int16_To_UnsignedInt8 (b[319:304])
tmp_dst[359:352] := Saturate_Int16_To_UnsignedInt8 (b[335:320])
tmp_dst[367:360] := Saturate_Int16_To_UnsignedInt8 (b[351:336])
tmp_dst[375:368] := Saturate_Int16_To_UnsignedInt8 (b[367:352])
tmp_dst[383:376] := Saturate_Int16_To_UnsignedInt8 (b[383:368])
tmp_dst[391:384] := Saturate_Int16_To_UnsignedInt8 (a[399:384])
tmp_dst[399:392] := Saturate_Int16_To_UnsignedInt8 (a[415:400])
tmp_dst[407:400] := Saturate_Int16_To_UnsignedInt8 (a[431:416])
tmp_dst[415:408] := Saturate_Int16_To_UnsignedInt8 (a[447:432])
tmp_dst[423:416] := Saturate_Int16_To_UnsignedInt8 (a[463:448])
tmp_dst[431:424] := Saturate_Int16_To_UnsignedInt8 (a[479:464])
tmp_dst[439:432] := Saturate_Int16_To_UnsignedInt8 (a[495:480])
tmp_dst[447:440] := Saturate_Int16_To_UnsignedInt8 (a[511:496])
tmp_dst[455:448] := Saturate_Int16_To_UnsignedInt8 (b[399:384])
tmp_dst[463:456] := Saturate_Int16_To_UnsignedInt8 (b[415:400])
tmp_dst[471:464] := Saturate_Int16_To_UnsignedInt8 (b[431:416])
tmp_dst[479:472] := Saturate_Int16_To_UnsignedInt8 (b[447:432])
tmp_dst[487:480] := Saturate_Int16_To_UnsignedInt8 (b[463:448])
tmp_dst[495:488] := Saturate_Int16_To_UnsignedInt8 (b[479:464])
tmp_dst[503:496] := Saturate_Int16_To_UnsignedInt8 (b[495:480])
tmp_dst[511:504] := Saturate_Int16_To_UnsignedInt8 (b[511:496])
FOR j := 0 to 63
i := j*8
IF k[j]
dst[i+7:i] := tmp_dst[i+7:i]
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:512] := 0
vpackuswb
__m512i _mm512_maskz_packus_epi16 (__mmask64 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_packus_epi16 (__mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpackuswb
CPUID Flags: AVX512BW
Description
Convert packed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
tmp_dst[7:0] := Saturate_Int16_To_UnsignedInt8 (a[15:0])
tmp_dst[15:8] := Saturate_Int16_To_UnsignedInt8 (a[31:16])
tmp_dst[23:16] := Saturate_Int16_To_UnsignedInt8 (a[47:32])
tmp_dst[31:24] := Saturate_Int16_To_UnsignedInt8 (a[63:48])
tmp_dst[39:32] := Saturate_Int16_To_UnsignedInt8 (a[79:64])
tmp_dst[47:40] := Saturate_Int16_To_UnsignedInt8 (a[95:80])
tmp_dst[55:48] := Saturate_Int16_To_UnsignedInt8 (a[111:96])
tmp_dst[63:56] := Saturate_Int16_To_UnsignedInt8 (a[127:112])
tmp_dst[71:64] := Saturate_Int16_To_UnsignedInt8 (b[15:0])
tmp_dst[79:72] := Saturate_Int16_To_UnsignedInt8 (b[31:16])
tmp_dst[87:80] := Saturate_Int16_To_UnsignedInt8 (b[47:32])
tmp_dst[95:88] := Saturate_Int16_To_UnsignedInt8 (b[63:48])
tmp_dst[103:96] := Saturate_Int16_To_UnsignedInt8 (b[79:64])
tmp_dst[111:104] := Saturate_Int16_To_UnsignedInt8 (b[95:80])
tmp_dst[119:112] := Saturate_Int16_To_UnsignedInt8 (b[111:96])
tmp_dst[127:120] := Saturate_Int16_To_UnsignedInt8 (b[127:112])
tmp_dst[135:128] := Saturate_Int16_To_UnsignedInt8 (a[143:128])
tmp_dst[143:136] := Saturate_Int16_To_UnsignedInt8 (a[159:144])
tmp_dst[151:144] := Saturate_Int16_To_UnsignedInt8 (a[175:160])
tmp_dst[159:152] := Saturate_Int16_To_UnsignedInt8 (a[191:176])
tmp_dst[167:160] := Saturate_Int16_To_UnsignedInt8 (a[207:192])
tmp_dst[175:168] := Saturate_Int16_To_UnsignedInt8 (a[223:208])
tmp_dst[183:176] := Saturate_Int16_To_UnsignedInt8 (a[239:224])
tmp_dst[191:184] := Saturate_Int16_To_UnsignedInt8 (a[255:240])
tmp_dst[199:192] := Saturate_Int16_To_UnsignedInt8 (b[143:128])
tmp_dst[207:200] := Saturate_Int16_To_UnsignedInt8 (b[159:144])
tmp_dst[215:208] := Saturate_Int16_To_UnsignedInt8 (b[175:160])
tmp_dst[223:216] := Saturate_Int16_To_UnsignedInt8 (b[191:176])
tmp_dst[231:224] := Saturate_Int16_To_UnsignedInt8 (b[207:192])
tmp_dst[239:232] := Saturate_Int16_To_UnsignedInt8 (b[223:208])
tmp_dst[247:240] := Saturate_Int16_To_UnsignedInt8 (b[239:224])
tmp_dst[255:248] := Saturate_Int16_To_UnsignedInt8 (b[255:240])
tmp_dst[263:256] := Saturate_Int16_To_UnsignedInt8 (a[271:256])
tmp_dst[271:264] := Saturate_Int16_To_UnsignedInt8 (a[287:272])
tmp_dst[279:272] := Saturate_Int16_To_UnsignedInt8 (a[303:288])
tmp_dst[287:280] := Saturate_Int16_To_UnsignedInt8 (a[319:304])
tmp_dst[295:288] := Saturate_Int16_To_UnsignedInt8 (a[335:320])
tmp_dst[303:296] := Saturate_Int16_To_UnsignedInt8 (a[351:336])
tmp_dst[311:304] := Saturate_Int16_To_UnsignedInt8 (a[367:352])
tmp_dst[319:312] := Saturate_Int16_To_UnsignedInt8 (a[383:368])
tmp_dst[327:320] := Saturate_Int16_To_UnsignedInt8 (b[271:256])
tmp_dst[335:328] := Saturate_Int16_To_UnsignedInt8 (b[287:272])
tmp_dst[343:336] := Saturate_Int16_To_UnsignedInt8 (b[303:288])
tmp_dst[351:344] := Saturate_Int16_To_UnsignedInt8 (b[319:304])
tmp_dst[359:352] := Saturate_Int16_To_UnsignedInt8 (b[335:320])
tmp_dst[367:360] := Saturate_Int16_To_UnsignedInt8 (b[351:336])
tmp_dst[375:368] := Saturate_Int16_To_UnsignedInt8 (b[367:352])
tmp_dst[383:376] := Saturate_Int16_To_UnsignedInt8 (b[383:368])
tmp_dst[391:384] := Saturate_Int16_To_UnsignedInt8 (a[399:384])
tmp_dst[399:392] := Saturate_Int16_To_UnsignedInt8 (a[415:400])
tmp_dst[407:400] := Saturate_Int16_To_UnsignedInt8 (a[431:416])
tmp_dst[415:408] := Saturate_Int16_To_UnsignedInt8 (a[447:432])
tmp_dst[423:416] := Saturate_Int16_To_UnsignedInt8 (a[463:448])
tmp_dst[431:424] := Saturate_Int16_To_UnsignedInt8 (a[479:464])
tmp_dst[439:432] := Saturate_Int16_To_UnsignedInt8 (a[495:480])
tmp_dst[447:440] := Saturate_Int16_To_UnsignedInt8 (a[511:496])
tmp_dst[455:448] := Saturate_Int16_To_UnsignedInt8 (b[399:384])
tmp_dst[463:456] := Saturate_Int16_To_UnsignedInt8 (b[415:400])
tmp_dst[471:464] := Saturate_Int16_To_UnsignedInt8 (b[431:416])
tmp_dst[479:472] := Saturate_Int16_To_UnsignedInt8 (b[447:432])
tmp_dst[487:480] := Saturate_Int16_To_UnsignedInt8 (b[463:448])
tmp_dst[495:488] := Saturate_Int16_To_UnsignedInt8 (b[479:464])
tmp_dst[503:496] := Saturate_Int16_To_UnsignedInt8 (b[495:480])
tmp_dst[511:504] := Saturate_Int16_To_UnsignedInt8 (b[511:496])
FOR j := 0 to 63
i := j*8
IF k[j]
dst[i+7:i] := tmp_dst[i+7:i]
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpackuswb
__m512i _mm512_packus_epi16 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_packus_epi16 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpackuswb
CPUID Flags: AVX512BW
Description
Convert packed 16-bit integers from a and b to packed 8-bit integers using unsigned saturation, and store the results in dst.
Operation
dst[7:0] := Saturate_Int16_To_UnsignedInt8 (a[15:0])
dst[15:8] := Saturate_Int16_To_UnsignedInt8 (a[31:16])
dst[23:16] := Saturate_Int16_To_UnsignedInt8 (a[47:32])
dst[31:24] := Saturate_Int16_To_UnsignedInt8 (a[63:48])
dst[39:32] := Saturate_Int16_To_UnsignedInt8 (a[79:64])
dst[47:40] := Saturate_Int16_To_UnsignedInt8 (a[95:80])
dst[55:48] := Saturate_Int16_To_UnsignedInt8 (a[111:96])
dst[63:56] := Saturate_Int16_To_UnsignedInt8 (a[127:112])
dst[71:64] := Saturate_Int16_To_UnsignedInt8 (b[15:0])
dst[79:72] := Saturate_Int16_To_UnsignedInt8 (b[31:16])
dst[87:80] := Saturate_Int16_To_UnsignedInt8 (b[47:32])
dst[95:88] := Saturate_Int16_To_UnsignedInt8 (b[63:48])
dst[103:96] := Saturate_Int16_To_UnsignedInt8 (b[79:64])
dst[111:104] := Saturate_Int16_To_UnsignedInt8 (b[95:80])
dst[119:112] := Saturate_Int16_To_UnsignedInt8 (b[111:96])
dst[127:120] := Saturate_Int16_To_UnsignedInt8 (b[127:112])
dst[135:128] := Saturate_Int16_To_UnsignedInt8 (a[143:128])
dst[143:136] := Saturate_Int16_To_UnsignedInt8 (a[159:144])
dst[151:144] := Saturate_Int16_To_UnsignedInt8 (a[175:160])
dst[159:152] := Saturate_Int16_To_UnsignedInt8 (a[191:176])
dst[167:160] := Saturate_Int16_To_UnsignedInt8 (a[207:192])
dst[175:168] := Saturate_Int16_To_UnsignedInt8 (a[223:208])
dst[183:176] := Saturate_Int16_To_UnsignedInt8 (a[239:224])
dst[191:184] := Saturate_Int16_To_UnsignedInt8 (a[255:240])
dst[199:192] := Saturate_Int16_To_UnsignedInt8 (b[143:128])
dst[207:200] := Saturate_Int16_To_UnsignedInt8 (b[159:144])
dst[215:208] := Saturate_Int16_To_UnsignedInt8 (b[175:160])
dst[223:216] := Saturate_Int16_To_UnsignedInt8 (b[191:176])
dst[231:224] := Saturate_Int16_To_UnsignedInt8 (b[207:192])
dst[239:232] := Saturate_Int16_To_UnsignedInt8 (b[223:208])
dst[247:240] := Saturate_Int16_To_UnsignedInt8 (b[239:224])
dst[255:248] := Saturate_Int16_To_UnsignedInt8 (b[255:240])
dst[263:256] := Saturate_Int16_To_UnsignedInt8 (a[271:256])
dst[271:264] := Saturate_Int16_To_UnsignedInt8 (a[287:272])
dst[279:272] := Saturate_Int16_To_UnsignedInt8 (a[303:288])
dst[287:280] := Saturate_Int16_To_UnsignedInt8 (a[319:304])
dst[295:288] := Saturate_Int16_To_UnsignedInt8 (a[335:320])
dst[303:296] := Saturate_Int16_To_UnsignedInt8 (a[351:336])
dst[311:304] := Saturate_Int16_To_UnsignedInt8 (a[367:352])
dst[319:312] := Saturate_Int16_To_UnsignedInt8 (a[383:368])
dst[327:320] := Saturate_Int16_To_UnsignedInt8 (b[271:256])
dst[335:328] := Saturate_Int16_To_UnsignedInt8 (b[287:272])
dst[343:336] := Saturate_Int16_To_UnsignedInt8 (b[303:288])
dst[351:344] := Saturate_Int16_To_UnsignedInt8 (b[319:304])
dst[359:352] := Saturate_Int16_To_UnsignedInt8 (b[335:320])
dst[367:360] := Saturate_Int16_To_UnsignedInt8 (b[351:336])
dst[375:368] := Saturate_Int16_To_UnsignedInt8 (b[367:352])
dst[383:376] := Saturate_Int16_To_UnsignedInt8 (b[383:368])
dst[391:384] := Saturate_Int16_To_UnsignedInt8 (a[399:384])
dst[399:392] := Saturate_Int16_To_UnsignedInt8 (a[415:400])
dst[407:400] := Saturate_Int16_To_UnsignedInt8 (a[431:416])
dst[415:408] := Saturate_Int16_To_UnsignedInt8 (a[447:432])
dst[423:416] := Saturate_Int16_To_UnsignedInt8 (a[463:448])
dst[431:424] := Saturate_Int16_To_UnsignedInt8 (a[479:464])
dst[439:432] := Saturate_Int16_To_UnsignedInt8 (a[495:480])
dst[447:440] := Saturate_Int16_To_UnsignedInt8 (a[511:496])
dst[455:448] := Saturate_Int16_To_UnsignedInt8 (b[399:384])
dst[463:456] := Saturate_Int16_To_UnsignedInt8 (b[415:400])
dst[471:464] := Saturate_Int16_To_UnsignedInt8 (b[431:416])
dst[479:472] := Saturate_Int16_To_UnsignedInt8 (b[447:432])
dst[487:480] := Saturate_Int16_To_UnsignedInt8 (b[463:448])
dst[495:488] := Saturate_Int16_To_UnsignedInt8 (b[479:464])
dst[503:496] := Saturate_Int16_To_UnsignedInt8 (b[495:480])
dst[511:504] := Saturate_Int16_To_UnsignedInt8 (b[511:496])
dst[MAX:512] := 0
vpackusdw
__m128i _mm_mask_packus_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_packus_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpackusdw
CPUID Flags: AVX512VL + AVX512BW
Description
Convert packed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
tmp_dst[15:0] := Saturate_Int32_To_UnsignedInt16 (a[31:0])
tmp_dst[31:16] := Saturate_Int32_To_UnsignedInt16 (a[63:32])
tmp_dst[47:32] := Saturate_Int32_To_UnsignedInt16 (a[95:64])
tmp_dst[63:48] := Saturate_Int32_To_UnsignedInt16 (a[127:96])
tmp_dst[79:64] := Saturate_Int32_To_UnsignedInt16 (b[31:0])
tmp_dst[95:80] := Saturate_Int32_To_UnsignedInt16 (b[63:32])
tmp_dst[111:96] := Saturate_Int32_To_UnsignedInt16 (b[95:64])
tmp_dst[127:112] := Saturate_Int32_To_UnsignedInt16 (b[127:96])
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := tmp_dst[i+15:i]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:128] := 0
vpackusdw
__m128i _mm_maskz_packus_epi32 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_packus_epi32 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpackusdw
CPUID Flags: AVX512VL + AVX512BW
Description
Convert packed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
tmp_dst[15:0] := Saturate_Int32_To_UnsignedInt16 (a[31:0])
tmp_dst[31:16] := Saturate_Int32_To_UnsignedInt16 (a[63:32])
tmp_dst[47:32] := Saturate_Int32_To_UnsignedInt16 (a[95:64])
tmp_dst[63:48] := Saturate_Int32_To_UnsignedInt16 (a[127:96])
tmp_dst[79:64] := Saturate_Int32_To_UnsignedInt16 (b[31:0])
tmp_dst[95:80] := Saturate_Int32_To_UnsignedInt16 (b[63:32])
tmp_dst[111:96] := Saturate_Int32_To_UnsignedInt16 (b[95:64])
tmp_dst[127:112] := Saturate_Int32_To_UnsignedInt16 (b[127:96])
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := tmp_dst[i+15:i]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
packusdw
__m128i _mm_packus_epi32 (__m128i a, __m128i b)
Synopsis
__m128i _mm_packus_epi32 (__m128i a, __m128i b)
#include "smmintrin.h"
Instruction: packusdw xmm, xmm
CPUID Flags: SSE4.1
Description
Convert packed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst.
Operation
dst[15:0] := Saturate_Int32_To_UnsignedInt16 (a[31:0])
dst[31:16] := Saturate_Int32_To_UnsignedInt16 (a[63:32])
dst[47:32] := Saturate_Int32_To_UnsignedInt16 (a[95:64])
dst[63:48] := Saturate_Int32_To_UnsignedInt16 (a[127:96])
dst[79:64] := Saturate_Int32_To_UnsignedInt16 (b[31:0])
dst[95:80] := Saturate_Int32_To_UnsignedInt16 (b[63:32])
dst[111:96] := Saturate_Int32_To_UnsignedInt16 (b[95:64])
dst[127:112] := Saturate_Int32_To_UnsignedInt16 (b[127:96])
Performance
vpackusdw
__m256i _mm256_mask_packus_epi32 (__m256i src, __mmask16 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_packus_epi32 (__m256i src, __mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpackusdw
CPUID Flags: AVX512VL + AVX512BW
Description
Convert packed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
tmp_dst[15:0] := Saturate_Int32_To_UnsignedInt16 (a[31:0])
tmp_dst[31:16] := Saturate_Int32_To_UnsignedInt16 (a[63:32])
tmp_dst[47:32] := Saturate_Int32_To_UnsignedInt16 (a[95:64])
tmp_dst[63:48] := Saturate_Int32_To_UnsignedInt16 (a[127:96])
tmp_dst[79:64] := Saturate_Int32_To_UnsignedInt16 (b[31:0])
tmp_dst[95:80] := Saturate_Int32_To_UnsignedInt16 (b[63:32])
tmp_dst[111:96] := Saturate_Int32_To_UnsignedInt16 (b[95:64])
tmp_dst[127:112] := Saturate_Int32_To_UnsignedInt16 (b[127:96])
tmp_dst[143:128] := Saturate_Int32_To_UnsignedInt16 (a[159:128])
tmp_dst[159:144] := Saturate_Int32_To_UnsignedInt16 (a[191:160])
tmp_dst[175:160] := Saturate_Int32_To_UnsignedInt16 (a[223:192])
tmp_dst[191:176] := Saturate_Int32_To_UnsignedInt16 (a[255:224])
tmp_dst[207:192] := Saturate_Int32_To_UnsignedInt16 (b[159:128])
tmp_dst[223:208] := Saturate_Int32_To_UnsignedInt16 (b[191:160])
tmp_dst[239:224] := Saturate_Int32_To_UnsignedInt16 (b[223:192])
tmp_dst[255:240] := Saturate_Int32_To_UnsignedInt16 (b[255:224])
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := tmp_dst[i+15:i]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
vpackusdw
__m256i _mm256_maskz_packus_epi32 (__mmask16 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_packus_epi32 (__mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpackusdw
CPUID Flags: AVX512VL + AVX512BW
Description
Convert packed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
tmp_dst[15:0] := Saturate_Int32_To_UnsignedInt16 (a[31:0])
tmp_dst[31:16] := Saturate_Int32_To_UnsignedInt16 (a[63:32])
tmp_dst[47:32] := Saturate_Int32_To_UnsignedInt16 (a[95:64])
tmp_dst[63:48] := Saturate_Int32_To_UnsignedInt16 (a[127:96])
tmp_dst[79:64] := Saturate_Int32_To_UnsignedInt16 (b[31:0])
tmp_dst[95:80] := Saturate_Int32_To_UnsignedInt16 (b[63:32])
tmp_dst[111:96] := Saturate_Int32_To_UnsignedInt16 (b[95:64])
tmp_dst[127:112] := Saturate_Int32_To_UnsignedInt16 (b[127:96])
tmp_dst[143:128] := Saturate_Int32_To_UnsignedInt16 (a[159:128])
tmp_dst[159:144] := Saturate_Int32_To_UnsignedInt16 (a[191:160])
tmp_dst[175:160] := Saturate_Int32_To_UnsignedInt16 (a[223:192])
tmp_dst[191:176] := Saturate_Int32_To_UnsignedInt16 (a[255:224])
tmp_dst[207:192] := Saturate_Int32_To_UnsignedInt16 (b[159:128])
tmp_dst[223:208] := Saturate_Int32_To_UnsignedInt16 (b[191:160])
tmp_dst[239:224] := Saturate_Int32_To_UnsignedInt16 (b[223:192])
tmp_dst[255:240] := Saturate_Int32_To_UnsignedInt16 (b[255:224])
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := tmp_dst[i+15:i]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpackusdw
__m256i _mm256_packus_epi32 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_packus_epi32 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpackusdw ymm, ymm, ymm
CPUID Flags: AVX2
Description
Convert packed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst.
Operation
dst[15:0] := Saturate_Int32_To_UnsignedInt16 (a[31:0])
dst[31:16] := Saturate_Int32_To_UnsignedInt16 (a[63:32])
dst[47:32] := Saturate_Int32_To_UnsignedInt16 (a[95:64])
dst[63:48] := Saturate_Int32_To_UnsignedInt16 (a[127:96])
dst[79:64] := Saturate_Int32_To_UnsignedInt16 (b[31:0])
dst[95:80] := Saturate_Int32_To_UnsignedInt16 (b[63:32])
dst[111:96] := Saturate_Int32_To_UnsignedInt16 (b[95:64])
dst[127:112] := Saturate_Int32_To_UnsignedInt16 (b[127:96])
dst[143:128] := Saturate_Int32_To_UnsignedInt16 (a[159:128])
dst[159:144] := Saturate_Int32_To_UnsignedInt16 (a[191:160])
dst[175:160] := Saturate_Int32_To_UnsignedInt16 (a[223:192])
dst[191:176] := Saturate_Int32_To_UnsignedInt16 (a[255:224])
dst[207:192] := Saturate_Int32_To_UnsignedInt16 (b[159:128])
dst[223:208] := Saturate_Int32_To_UnsignedInt16 (b[191:160])
dst[239:224] := Saturate_Int32_To_UnsignedInt16 (b[223:192])
dst[255:240] := Saturate_Int32_To_UnsignedInt16 (b[255:224])
dst[MAX:256] := 0
Performance
vpackusdw
__m512i _mm512_mask_packus_epi32 (__m512i src, __mmask32 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_packus_epi32 (__m512i src, __mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpackusdw
CPUID Flags: AVX512BW
Description
Convert packed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
tmp_dst[15:0] := Saturate_Int32_To_UnsignedInt16 (a[31:0])
tmp_dst[31:16] := Saturate_Int32_To_UnsignedInt16 (a[63:32])
tmp_dst[47:32] := Saturate_Int32_To_UnsignedInt16 (a[95:64])
tmp_dst[63:48] := Saturate_Int32_To_UnsignedInt16 (a[127:96])
tmp_dst[79:64] := Saturate_Int32_To_UnsignedInt16 (b[31:0])
tmp_dst[95:80] := Saturate_Int32_To_UnsignedInt16 (b[63:32])
tmp_dst[111:96] := Saturate_Int32_To_UnsignedInt16 (b[95:64])
tmp_dst[127:112] := Saturate_Int32_To_UnsignedInt16 (b[127:96])
tmp_dst[143:128] := Saturate_Int32_To_UnsignedInt16 (a[159:128])
tmp_dst[159:144] := Saturate_Int32_To_UnsignedInt16 (a[191:160])
tmp_dst[175:160] := Saturate_Int32_To_UnsignedInt16 (a[223:192])
tmp_dst[191:176] := Saturate_Int32_To_UnsignedInt16 (a[255:224])
tmp_dst[207:192] := Saturate_Int32_To_UnsignedInt16 (b[159:128])
tmp_dst[223:208] := Saturate_Int32_To_UnsignedInt16 (b[191:160])
tmp_dst[239:224] := Saturate_Int32_To_UnsignedInt16 (b[223:192])
tmp_dst[255:240] := Saturate_Int32_To_UnsignedInt16 (b[255:224])
tmp_dst[271:256] := Saturate_Int32_To_UnsignedInt16 (a[287:256])
tmp_dst[287:272] := Saturate_Int32_To_UnsignedInt16 (a[319:288])
tmp_dst[303:288] := Saturate_Int32_To_UnsignedInt16 (a[351:320])
tmp_dst[319:304] := Saturate_Int32_To_UnsignedInt16 (a[383:352])
tmp_dst[335:320] := Saturate_Int32_To_UnsignedInt16 (b[287:256])
tmp_dst[351:336] := Saturate_Int32_To_UnsignedInt16 (b[319:288])
tmp_dst[367:352] := Saturate_Int32_To_UnsignedInt16 (b[351:320])
tmp_dst[383:368] := Saturate_Int32_To_UnsignedInt16 (b[383:352])
tmp_dst[399:384] := Saturate_Int32_To_UnsignedInt16 (a[415:384])
tmp_dst[415:400] := Saturate_Int32_To_UnsignedInt16 (a[447:416])
tmp_dst[431:416] := Saturate_Int32_To_UnsignedInt16 (a[479:448])
tmp_dst[447:432] := Saturate_Int32_To_UnsignedInt16 (a[511:480])
tmp_dst[463:448] := Saturate_Int32_To_UnsignedInt16 (b[415:384])
tmp_dst[479:464] := Saturate_Int32_To_UnsignedInt16 (b[447:416])
tmp_dst[495:480] := Saturate_Int32_To_UnsignedInt16 (b[479:448])
tmp_dst[511:496] := Saturate_Int32_To_UnsignedInt16 (b[511:480])
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := tmp_dst[i+15:i]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:512] := 0
vpackusdw
__m512i _mm512_maskz_packus_epi32 (__mmask32 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_packus_epi32 (__mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpackusdw
CPUID Flags: AVX512BW
Description
Convert packed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
tmp_dst[15:0] := Saturate_Int32_To_UnsignedInt16 (a[31:0])
tmp_dst[31:16] := Saturate_Int32_To_UnsignedInt16 (a[63:32])
tmp_dst[47:32] := Saturate_Int32_To_UnsignedInt16 (a[95:64])
tmp_dst[63:48] := Saturate_Int32_To_UnsignedInt16 (a[127:96])
tmp_dst[79:64] := Saturate_Int32_To_UnsignedInt16 (b[31:0])
tmp_dst[95:80] := Saturate_Int32_To_UnsignedInt16 (b[63:32])
tmp_dst[111:96] := Saturate_Int32_To_UnsignedInt16 (b[95:64])
tmp_dst[127:112] := Saturate_Int32_To_UnsignedInt16 (b[127:96])
tmp_dst[143:128] := Saturate_Int32_To_UnsignedInt16 (a[159:128])
tmp_dst[159:144] := Saturate_Int32_To_UnsignedInt16 (a[191:160])
tmp_dst[175:160] := Saturate_Int32_To_UnsignedInt16 (a[223:192])
tmp_dst[191:176] := Saturate_Int32_To_UnsignedInt16 (a[255:224])
tmp_dst[207:192] := Saturate_Int32_To_UnsignedInt16 (b[159:128])
tmp_dst[223:208] := Saturate_Int32_To_UnsignedInt16 (b[191:160])
tmp_dst[239:224] := Saturate_Int32_To_UnsignedInt16 (b[223:192])
tmp_dst[255:240] := Saturate_Int32_To_UnsignedInt16 (b[255:224])
tmp_dst[271:256] := Saturate_Int32_To_UnsignedInt16 (a[287:256])
tmp_dst[287:272] := Saturate_Int32_To_UnsignedInt16 (a[319:288])
tmp_dst[303:288] := Saturate_Int32_To_UnsignedInt16 (a[351:320])
tmp_dst[319:304] := Saturate_Int32_To_UnsignedInt16 (a[383:352])
tmp_dst[335:320] := Saturate_Int32_To_UnsignedInt16 (b[287:256])
tmp_dst[351:336] := Saturate_Int32_To_UnsignedInt16 (b[319:288])
tmp_dst[367:352] := Saturate_Int32_To_UnsignedInt16 (b[351:320])
tmp_dst[383:368] := Saturate_Int32_To_UnsignedInt16 (b[383:352])
tmp_dst[399:384] := Saturate_Int32_To_UnsignedInt16 (a[415:384])
tmp_dst[415:400] := Saturate_Int32_To_UnsignedInt16 (a[447:416])
tmp_dst[431:416] := Saturate_Int32_To_UnsignedInt16 (a[479:448])
tmp_dst[447:432] := Saturate_Int32_To_UnsignedInt16 (a[511:480])
tmp_dst[463:448] := Saturate_Int32_To_UnsignedInt16 (b[415:384])
tmp_dst[479:464] := Saturate_Int32_To_UnsignedInt16 (b[447:416])
tmp_dst[495:480] := Saturate_Int32_To_UnsignedInt16 (b[479:448])
tmp_dst[511:496] := Saturate_Int32_To_UnsignedInt16 (b[511:480])
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := tmp_dst[i+15:i]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpackusdw
__m512i _mm512_packus_epi32 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_packus_epi32 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpackusdw
CPUID Flags: AVX512BW
Description
Convert packed 32-bit integers from a and b to packed 16-bit integers using unsigned saturation, and store the results in dst.
Operation
dst[15:0] := Saturate_Int32_To_UnsignedInt16 (a[31:0])
dst[31:16] := Saturate_Int32_To_UnsignedInt16 (a[63:32])
dst[47:32] := Saturate_Int32_To_UnsignedInt16 (a[95:64])
dst[63:48] := Saturate_Int32_To_UnsignedInt16 (a[127:96])
dst[79:64] := Saturate_Int32_To_UnsignedInt16 (b[31:0])
dst[95:80] := Saturate_Int32_To_UnsignedInt16 (b[63:32])
dst[111:96] := Saturate_Int32_To_UnsignedInt16 (b[95:64])
dst[127:112] := Saturate_Int32_To_UnsignedInt16 (b[127:96])
dst[143:128] := Saturate_Int32_To_UnsignedInt16 (a[159:128])
dst[159:144] := Saturate_Int32_To_UnsignedInt16 (a[191:160])
dst[175:160] := Saturate_Int32_To_UnsignedInt16 (a[223:192])
dst[191:176] := Saturate_Int32_To_UnsignedInt16 (a[255:224])
dst[207:192] := Saturate_Int32_To_UnsignedInt16 (b[159:128])
dst[223:208] := Saturate_Int32_To_UnsignedInt16 (b[191:160])
dst[239:224] := Saturate_Int32_To_UnsignedInt16 (b[223:192])
dst[255:240] := Saturate_Int32_To_UnsignedInt16 (b[255:224])
dst[271:256] := Saturate_Int32_To_UnsignedInt16 (a[287:256])
dst[287:272] := Saturate_Int32_To_UnsignedInt16 (a[319:288])
dst[303:288] := Saturate_Int32_To_UnsignedInt16 (a[351:320])
dst[319:304] := Saturate_Int32_To_UnsignedInt16 (a[383:352])
dst[335:320] := Saturate_Int32_To_UnsignedInt16 (b[287:256])
dst[351:336] := Saturate_Int32_To_UnsignedInt16 (b[319:288])
dst[367:352] := Saturate_Int32_To_UnsignedInt16 (b[351:320])
dst[383:368] := Saturate_Int32_To_UnsignedInt16 (b[383:352])
dst[399:384] := Saturate_Int32_To_UnsignedInt16 (a[415:384])
dst[415:400] := Saturate_Int32_To_UnsignedInt16 (a[447:416])
dst[431:416] := Saturate_Int32_To_UnsignedInt16 (a[479:448])
dst[447:432] := Saturate_Int32_To_UnsignedInt16 (a[511:480])
dst[463:448] := Saturate_Int32_To_UnsignedInt16 (b[415:384])
dst[479:464] := Saturate_Int32_To_UnsignedInt16 (b[447:416])
dst[495:480] := Saturate_Int32_To_UnsignedInt16 (b[479:448])
dst[511:496] := Saturate_Int32_To_UnsignedInt16 (b[511:480])
dst[MAX:512] := 0
pause
void _mm_pause (void)
Synopsis
void _mm_pause (void)
#include "emmintrin.h"
Instruction: pause
CPUID Flags: SSE2
Description
Provide a hint to the processor that the code sequence is a spin-wait loop. This can help improve the performance and power consumption of spin-wait loops.
Performance
pavgb
__m64 _m_pavgb (__m64 a, __m64 b)
Synopsis
__m64 _m_pavgb (__m64 a, __m64 b)
#include "xmmintrin.h"
Instruction: pavgb mm, mm
CPUID Flags: SSE
Description
Average packed unsigned 8-bit integers in a and b, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*8
dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
ENDFOR
pavgw
__m64 _m_pavgw (__m64 a, __m64 b)
Synopsis
__m64 _m_pavgw (__m64 a, __m64 b)
#include "xmmintrin.h"
Instruction: pavgw mm, mm
CPUID Flags: SSE
Description
Average packed unsigned 16-bit integers in a and b, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*16
dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
ENDFOR
pdep
unsigned int _pdep_u32 (unsigned int a, unsigned int mask)
Synopsis
unsigned int _pdep_u32 (unsigned int a, unsigned int mask)
#include "immintrin.h"
Instruction: pdep r32, r32, r32
CPUID Flags: BMI2
Description
Deposit contiguous low bits from unsigned 32-bit integer a to dst at the corresponding bit locations specified by mask; all other bits in dst are set to zero.
Operation
tmp := a
dst := 0
m := 0
k := 0
DO WHILE m < 32
IF mask[m] = 1
dst[m] := tmp[k]
k := k + 1
FI
m := m + 1
OD
Performance
pdep
unsigned __int64 _pdep_u64 (unsigned __int64 a, unsigned __int64 mask)
Synopsis
unsigned __int64 _pdep_u64 (unsigned __int64 a, unsigned __int64 mask)
#include "immintrin.h"
Instruction: pdep r64, r64, r64
CPUID Flags: BMI2
Description
Deposit contiguous low bits from unsigned 64-bit integer a to dst at the corresponding bit locations specified by mask; all other bits in dst are set to zero.
Operation
tmp := a
dst := 0
m := 0
k := 0
DO WHILE m < 64
IF mask[m] = 1
dst[m] := tmp[k]
k := k + 1
FI
m := m + 1
OD
Performance
vpermilpd
__m128d _mm_mask_permute_pd (__m128d src, __mmask8 k, __m128d a, const int imm8)
Synopsis
__m128d _mm_mask_permute_pd (__m128d src, __mmask8 k, __m128d a, const int imm8)
#include "immintrin.h"
Instruction: vpermilpd
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle double-precision (64-bit) floating-point elements in a using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]
IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]
IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]
IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpermilpd
__m128d _mm_maskz_permute_pd (__mmask8 k, __m128d a, const int imm8)
Synopsis
__m128d _mm_maskz_permute_pd (__mmask8 k, __m128d a, const int imm8)
#include "immintrin.h"
Instruction: vpermilpd
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle double-precision (64-bit) floating-point elements in a using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]
IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]
IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]
IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpermilpd
__m128d _mm_permute_pd (__m128d a, int imm8)
Synopsis
__m128d _mm_permute_pd (__m128d a, int imm8)
#include "immintrin.h"
Instruction: vpermilpd xmm, xmm, imm
CPUID Flags: AVX
Description
Shuffle double-precision (64-bit) floating-point elements in a using the control in imm8, and store the results in dst.
Operation
IF (imm8[0] == 0) dst[63:0] := a[63:0]
IF (imm8[0] == 1) dst[63:0] := a[127:64]
IF (imm8[1] == 0) dst[127:64] := a[63:0]
IF (imm8[1] == 1) dst[127:64] := a[127:64]
dst[MAX:128] := 0
Performance
vpermilpd
__m256d _mm256_mask_permute_pd (__m256d src, __mmask8 k, __m256d a, const int imm8)
Synopsis
__m256d _mm256_mask_permute_pd (__m256d src, __mmask8 k, __m256d a, const int imm8)
#include "immintrin.h"
Instruction: vpermilpd
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]
IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]
IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]
IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]
IF (imm8[2] == 0) tmp_dst[191:128] := a[191:128]
IF (imm8[2] == 1) tmp_dst[191:128] := a[255:192]
IF (imm8[3] == 0) tmp_dst[255:192] := a[191:128]
IF (imm8[3] == 1) tmp_dst[255:192] := a[255:192]
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpermilpd
__m256d _mm256_maskz_permute_pd (__mmask8 k, __m256d a, const int imm8)
Synopsis
__m256d _mm256_maskz_permute_pd (__mmask8 k, __m256d a, const int imm8)
#include "immintrin.h"
Instruction: vpermilpd
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]
IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]
IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]
IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]
IF (imm8[2] == 0) tmp_dst[191:128] := a[191:128]
IF (imm8[2] == 1) tmp_dst[191:128] := a[255:192]
IF (imm8[3] == 0) tmp_dst[255:192] := a[191:128]
IF (imm8[3] == 1) tmp_dst[255:192] := a[255:192]
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpermilpd
__m256d _mm256_permute_pd (__m256d a, int imm8)
Synopsis
__m256d _mm256_permute_pd (__m256d a, int imm8)
#include "immintrin.h"
Instruction: vpermilpd ymm, ymm, imm
CPUID Flags: AVX
Description
Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst.
Operation
IF (imm8[0] == 0) dst[63:0] := a[63:0]
IF (imm8[0] == 1) dst[63:0] := a[127:64]
IF (imm8[1] == 0) dst[127:64] := a[63:0]
IF (imm8[1] == 1) dst[127:64] := a[127:64]
IF (imm8[2] == 0) dst[191:128] := a[191:128]
IF (imm8[2] == 1) dst[191:128] := a[255:192]
IF (imm8[3] == 0) dst[255:192] := a[191:128]
IF (imm8[3] == 1) dst[255:192] := a[255:192]
dst[MAX:256] := 0
Performance
vpermilpd
__m512d _mm512_mask_permute_pd (__m512d src, __mmask8 k, __m512d a, const int imm8)
Synopsis
__m512d _mm512_mask_permute_pd (__m512d src, __mmask8 k, __m512d a, const int imm8)
#include "immintrin.h"
Instruction: vpermilpd zmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]
IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]
IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]
IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]
IF (imm8[2] == 0) tmp_dst[191:128] := a[191:128]
IF (imm8[2] == 1) tmp_dst[191:128] := a[255:192]
IF (imm8[3] == 0) tmp_dst[255:192] := a[191:128]
IF (imm8[3] == 1) tmp_dst[255:192] := a[255:192]
IF (imm8[4] == 0) tmp_dst[319:256] := a[319:256]
IF (imm8[4] == 1) tmp_dst[319:256] := a[383:320]
IF (imm8[5] == 0) tmp_dst[383:320] := a[319:256]
IF (imm8[5] == 1) tmp_dst[383:320] := a[383:320]
IF (imm8[6] == 0) tmp_dst[447:384] := a[447:384]
IF (imm8[6] == 1) tmp_dst[447:384] := a[511:448]
IF (imm8[7] == 0) tmp_dst[511:448] := a[447:384]
IF (imm8[7] == 1) tmp_dst[511:448] := a[511:448]
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpermilpd
__m512d _mm512_maskz_permute_pd (__mmask8 k, __m512d a, const int imm8)
Synopsis
__m512d _mm512_maskz_permute_pd (__mmask8 k, __m512d a, const int imm8)
#include "immintrin.h"
Instruction: vpermilpd zmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
IF (imm8[0] == 0) tmp_dst[63:0] := a[63:0]
IF (imm8[0] == 1) tmp_dst[63:0] := a[127:64]
IF (imm8[1] == 0) tmp_dst[127:64] := a[63:0]
IF (imm8[1] == 1) tmp_dst[127:64] := a[127:64]
IF (imm8[2] == 0) tmp_dst[191:128] := a[191:128]
IF (imm8[2] == 1) tmp_dst[191:128] := a[255:192]
IF (imm8[3] == 0) tmp_dst[255:192] := a[191:128]
IF (imm8[3] == 1) tmp_dst[255:192] := a[255:192]
IF (imm8[4] == 0) tmp_dst[319:256] := a[319:256]
IF (imm8[4] == 1) tmp_dst[319:256] := a[383:320]
IF (imm8[5] == 0) tmp_dst[383:320] := a[319:256]
IF (imm8[5] == 1) tmp_dst[383:320] := a[383:320]
IF (imm8[6] == 0) tmp_dst[447:384] := a[447:384]
IF (imm8[6] == 1) tmp_dst[447:384] := a[511:448]
IF (imm8[7] == 0) tmp_dst[511:448] := a[447:384]
IF (imm8[7] == 1) tmp_dst[511:448] := a[511:448]
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpermilpd
__m512d _mm512_permute_pd (__m512d a, const int imm8)
Synopsis
__m512d _mm512_permute_pd (__m512d a, const int imm8)
#include "immintrin.h"
Instruction: vpermilpd zmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst.
Operation
IF (imm8[0] == 0) dst[63:0] := a[63:0]
IF (imm8[0] == 1) dst[63:0] := a[127:64]
IF (imm8[1] == 0) dst[127:64] := a[63:0]
IF (imm8[1] == 1) dst[127:64] := a[127:64]
IF (imm8[2] == 0) dst[191:128] := a[191:128]
IF (imm8[2] == 1) dst[191:128] := a[255:192]
IF (imm8[3] == 0) dst[255:192] := a[191:128]
IF (imm8[3] == 1) dst[255:192] := a[255:192]
IF (imm8[4] == 0) dst[319:256] := a[319:256]
IF (imm8[4] == 1) dst[319:256] := a[383:320]
IF (imm8[5] == 0) dst[383:320] := a[319:256]
IF (imm8[5] == 1) dst[383:320] := a[383:320]
IF (imm8[6] == 0) dst[447:384] := a[447:384]
IF (imm8[6] == 1) dst[447:384] := a[511:448]
IF (imm8[7] == 0) dst[511:448] := a[447:384]
IF (imm8[7] == 1) dst[511:448] := a[511:448]
dst[MAX:512] := 0
vpermilps
__m128 _mm_mask_permute_ps (__m128 src, __mmask8 k, __m128 a, const int imm8)
Synopsis
__m128 _mm_mask_permute_ps (__m128 src, __mmask8 k, __m128 a, const int imm8)
#include "immintrin.h"
Instruction: vpermilps
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle single-precision (32-bit) floating-point elements in a using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[31:0] := src[31:0]
1: tmp[31:0] := src[63:32]
2: tmp[31:0] := src[95:64]
3: tmp[31:0] := src[127:96]
ESAC
RETURN tmp[31:0]
}
tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vpermilps
__m128 _mm_maskz_permute_ps (__mmask8 k, __m128 a, const int imm8)
Synopsis
__m128 _mm_maskz_permute_ps (__mmask8 k, __m128 a, const int imm8)
#include "immintrin.h"
Instruction: vpermilps
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle single-precision (32-bit) floating-point elements in a using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[31:0] := src[31:0]
1: tmp[31:0] := src[63:32]
2: tmp[31:0] := src[95:64]
3: tmp[31:0] := src[127:96]
ESAC
RETURN tmp[31:0]
}
tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpermilps
__m128 _mm_permute_ps (__m128 a, int imm8)
Synopsis
__m128 _mm_permute_ps (__m128 a, int imm8)
#include "immintrin.h"
Instruction: vpermilps xmm, xmm, imm
CPUID Flags: AVX
Description
Shuffle single-precision (32-bit) floating-point elements in a using the control in imm8, and store the results in dst.
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[31:0] := src[31:0]
1: tmp[31:0] := src[63:32]
2: tmp[31:0] := src[95:64]
3: tmp[31:0] := src[127:96]
ESAC
RETURN tmp[31:0]
}
dst[31:0] := SELECT4(a[127:0], imm8[1:0])
dst[63:32] := SELECT4(a[127:0], imm8[3:2])
dst[95:64] := SELECT4(a[127:0], imm8[5:4])
dst[127:96] := SELECT4(a[127:0], imm8[7:6])
dst[MAX:128] := 0
Performance
vpermilps
__m256 _mm256_mask_permute_ps (__m256 src, __mmask8 k, __m256 a, const int imm8)
Synopsis
__m256 _mm256_mask_permute_ps (__m256 src, __mmask8 k, __m256 a, const int imm8)
#include "immintrin.h"
Instruction: vpermilps
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[31:0] := src[31:0]
1: tmp[31:0] := src[63:32]
2: tmp[31:0] := src[95:64]
3: tmp[31:0] := src[127:96]
ESAC
RETURN tmp[31:0]
}
tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4])
tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6])
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vpermilps
__m256 _mm256_maskz_permute_ps (__mmask8 k, __m256 a, const int imm8)
Synopsis
__m256 _mm256_maskz_permute_ps (__mmask8 k, __m256 a, const int imm8)
#include "immintrin.h"
Instruction: vpermilps
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[31:0] := src[31:0]
1: tmp[31:0] := src[63:32]
2: tmp[31:0] := src[95:64]
3: tmp[31:0] := src[127:96]
ESAC
RETURN tmp[31:0]
}
tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4])
tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6])
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpermilps
__m256 _mm256_permute_ps (__m256 a, int imm8)
Synopsis
__m256 _mm256_permute_ps (__m256 a, int imm8)
#include "immintrin.h"
Instruction: vpermilps ymm, ymm, imm
CPUID Flags: AVX
Description
Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst.
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[31:0] := src[31:0]
1: tmp[31:0] := src[63:32]
2: tmp[31:0] := src[95:64]
3: tmp[31:0] := src[127:96]
ESAC
RETURN tmp[31:0]
}
dst[31:0] := SELECT4(a[127:0], imm8[1:0])
dst[63:32] := SELECT4(a[127:0], imm8[3:2])
dst[95:64] := SELECT4(a[127:0], imm8[5:4])
dst[127:96] := SELECT4(a[127:0], imm8[7:6])
dst[159:128] := SELECT4(a[255:128], imm8[1:0])
dst[191:160] := SELECT4(a[255:128], imm8[3:2])
dst[223:192] := SELECT4(a[255:128], imm8[5:4])
dst[255:224] := SELECT4(a[255:128], imm8[7:6])
dst[MAX:256] := 0
Performance
vpermilps
__m512 _mm512_mask_permute_ps (__m512 src, __mmask16 k, __m512 a, const int imm8)
Synopsis
__m512 _mm512_mask_permute_ps (__m512 src, __mmask16 k, __m512 a, const int imm8)
#include "immintrin.h"
Instruction: vpermilps zmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[31:0] := src[31:0]
1: tmp[31:0] := src[63:32]
2: tmp[31:0] := src[95:64]
3: tmp[31:0] := src[127:96]
ESAC
RETURN tmp[31:0]
}
tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4])
tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6])
tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0])
tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2])
tmp_dst[351:320] := SELECT4(a[383:256], imm8[5:4])
tmp_dst[383:352] := SELECT4(a[383:256], imm8[7:6])
tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0])
tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2])
tmp_dst[479:448] := SELECT4(a[511:384], imm8[5:4])
tmp_dst[511:480] := SELECT4(a[511:384], imm8[7:6])
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpermilps
__m512 _mm512_maskz_permute_ps (__mmask16 k, __m512 a, const int imm8)
Synopsis
__m512 _mm512_maskz_permute_ps (__mmask16 k, __m512 a, const int imm8)
#include "immintrin.h"
Instruction: vpermilps zmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[31:0] := src[31:0]
1: tmp[31:0] := src[63:32]
2: tmp[31:0] := src[95:64]
3: tmp[31:0] := src[127:96]
ESAC
RETURN tmp[31:0]
}
tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4])
tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6])
tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0])
tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2])
tmp_dst[351:320] := SELECT4(a[383:256], imm8[5:4])
tmp_dst[383:352] := SELECT4(a[383:256], imm8[7:6])
tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0])
tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2])
tmp_dst[479:448] := SELECT4(a[511:384], imm8[5:4])
tmp_dst[511:480] := SELECT4(a[511:384], imm8[7:6])
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpermilps
__m512 _mm512_permute_ps (__m512 a, const int imm8)
Synopsis
__m512 _mm512_permute_ps (__m512 a, const int imm8)
#include "immintrin.h"
Instruction: vpermilps zmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst.
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[31:0] := src[31:0]
1: tmp[31:0] := src[63:32]
2: tmp[31:0] := src[95:64]
3: tmp[31:0] := src[127:96]
ESAC
RETURN tmp[31:0]
}
dst[31:0] := SELECT4(a[127:0], imm8[1:0])
dst[63:32] := SELECT4(a[127:0], imm8[3:2])
dst[95:64] := SELECT4(a[127:0], imm8[5:4])
dst[127:96] := SELECT4(a[127:0], imm8[7:6])
dst[159:128] := SELECT4(a[255:128], imm8[1:0])
dst[191:160] := SELECT4(a[255:128], imm8[3:2])
dst[223:192] := SELECT4(a[255:128], imm8[5:4])
dst[255:224] := SELECT4(a[255:128], imm8[7:6])
dst[287:256] := SELECT4(a[383:256], imm8[1:0])
dst[319:288] := SELECT4(a[383:256], imm8[3:2])
dst[351:320] := SELECT4(a[383:256], imm8[5:4])
dst[383:352] := SELECT4(a[383:256], imm8[7:6])
dst[415:384] := SELECT4(a[511:384], imm8[1:0])
dst[447:416] := SELECT4(a[511:384], imm8[3:2])
dst[479:448] := SELECT4(a[511:384], imm8[5:4])
dst[511:480] := SELECT4(a[511:384], imm8[7:6])
dst[MAX:512] := 0
vperm2f128
__m256d _mm256_permute2f128_pd (__m256d a, __m256d b, int imm8)
Synopsis
__m256d _mm256_permute2f128_pd (__m256d a, __m256d b, int imm8)
#include "immintrin.h"
Instruction: vperm2f128 ymm, ymm, ymm, imm
CPUID Flags: AVX
Description
Shuffle 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst.
Operation
SELECT4(src1, src2, control){
CASE(control[1:0])
0: tmp[127:0] := src1[127:0]
1: tmp[127:0] := src1[255:128]
2: tmp[127:0] := src2[127:0]
3: tmp[127:0] := src2[255:128]
ESAC
IF control[3]
tmp[127:0] := 0
FI
RETURN tmp[127:0]
}
dst[127:0] := SELECT4(a[255:0], b[255:0], imm8[3:0])
dst[255:128] := SELECT4(a[255:0], b[255:0], imm8[7:4])
dst[MAX:256] := 0
Performance
vperm2f128
__m256 _mm256_permute2f128_ps (__m256 a, __m256 b, int imm8)
Synopsis
__m256 _mm256_permute2f128_ps (__m256 a, __m256 b, int imm8)
#include "immintrin.h"
Instruction: vperm2f128 ymm, ymm, ymm, imm
CPUID Flags: AVX
Description
Shuffle 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst.
Operation
SELECT4(src1, src2, control){
CASE(control[1:0])
0: tmp[127:0] := src1[127:0]
1: tmp[127:0] := src1[255:128]
2: tmp[127:0] := src2[127:0]
3: tmp[127:0] := src2[255:128]
ESAC
IF control[3]
tmp[127:0] := 0
FI
RETURN tmp[127:0]
}
dst[127:0] := SELECT4(a[255:0], b[255:0], imm8[3:0])
dst[255:128] := SELECT4(a[255:0], b[255:0], imm8[7:4])
dst[MAX:256] := 0
Performance
vperm2f128
__m256i _mm256_permute2f128_si256 (__m256i a, __m256i b, int imm8)
Synopsis
__m256i _mm256_permute2f128_si256 (__m256i a, __m256i b, int imm8)
#include "immintrin.h"
Instruction: vperm2f128 ymm, ymm, ymm, imm
CPUID Flags: AVX
Description
Shuffle 128-bits (composed of integer data) selected by imm8 from a and b, and store the results in dst.
Operation
SELECT4(src1, src2, control){
CASE(control[1:0])
0: tmp[127:0] := src1[127:0]
1: tmp[127:0] := src1[255:128]
2: tmp[127:0] := src2[127:0]
3: tmp[127:0] := src2[255:128]
ESAC
IF control[3]
tmp[127:0] := 0
FI
RETURN tmp[127:0]
}
dst[127:0] := SELECT4(a[255:0], b[255:0], imm8[3:0])
dst[255:128] := SELECT4(a[255:0], b[255:0], imm8[7:4])
dst[MAX:256] := 0
Performance
vperm2i128
__m256i _mm256_permute2x128_si256 (__m256i a, __m256i b, const int imm8)
Synopsis
__m256i _mm256_permute2x128_si256 (__m256i a, __m256i b, const int imm8)
#include "immintrin.h"
Instruction: vperm2i128 ymm, ymm, ymm, imm
CPUID Flags: AVX2
Description
Shuffle 128-bits (composed of integer data) selected by imm8 from a and b, and store the results in dst.
Operation
SELECT4(src1, src2, control){
CASE(control[1:0])
0: tmp[127:0] := src1[127:0]
1: tmp[127:0] := src1[255:128]
2: tmp[127:0] := src2[127:0]
3: tmp[127:0] := src2[255:128]
ESAC
IF control[3]
tmp[127:0] := 0
FI
RETURN tmp[127:0]
}
dst[127:0] := SELECT4(a[255:0], b[255:0], imm8[3:0])
dst[255:128] := SELECT4(a[255:0], b[255:0], imm8[7:4])
dst[MAX:256] := 0
Performance
vpermf32x4
__m512i _mm512_mask_permute4f128_epi32 (__m512i src, __mmask16 k, __m512i a, _MM_PERM_ENUM imm8)
Synopsis
__m512i _mm512_mask_permute4f128_epi32 (__m512i src, __mmask16 k, __m512i a, _MM_PERM_ENUM imm8)
#include "immintrin.h"
Instruction: vpermf32x4 zmm {k}, m512, imm
CPUID Flags: KNCNI
Description
Permutes 128-bit blocks of the packed 32-bit integer vector a using constant imm8. The results are stored in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
SELECT4(src, control) {
CASE control[1:0] OF
0: tmp[127:0] := src[127:0]
1: tmp[127:0] := src[255:128]
2: tmp[127:0] := src[383:256]
3: tmp[127:0] := src[511:384]
ESAC
RETURN tmp[127:0]
}
tmp[511:0] := 0
FOR j := 0 to 3
i := j*128
n := j*2
tmp[i+127:i] := SELECT4(a[511:0], imm8[n+1:n])
ENDFOR
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := tmp[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpermf32x4
__m512i _mm512_permute4f128_epi32 (__m512i a, _MM_PERM_ENUM imm8)
Synopsis
__m512i _mm512_permute4f128_epi32 (__m512i a, _MM_PERM_ENUM imm8)
#include "immintrin.h"
Instruction: vpermf32x4 zmm {k}, m512, imm
CPUID Flags: KNCNI
Description
Permutes 128-bit blocks of the packed 32-bit integer vector a using constant imm8. The results are stored in dst.
Operation
SELECT4(src, control) {
CASE control[1:0] OF
0: tmp[127:0] := src[127:0]
1: tmp[127:0] := src[255:128]
2: tmp[127:0] := src[383:256]
3: tmp[127:0] := src[511:384]
ESAC
RETURN tmp[127:0]
}
FOR j := 0 to 3
i := j*128
n := j*2
dst[i+127:i] := SELECT4(a[511:0], imm8[n+1:n])
ENDFOR
dst[MAX:512] := 0
vpermf32x4
__m512 _mm512_mask_permute4f128_ps (__m512 src, __mmask16 k, __m512 a, _MM_PERM_ENUM imm8)
Synopsis
__m512 _mm512_mask_permute4f128_ps (__m512 src, __mmask16 k, __m512 a, _MM_PERM_ENUM imm8)
#include "immintrin.h"
Instruction: vpermf32x4 zmm {k}, m512, imm
CPUID Flags: KNCNI
Description
Permutes 128-bit blocks of the packed single-precision (32-bit) floating-point elements in a using constant imm8. The results are stored in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
SELECT4(src, control) {
CASE control[1:0] OF
0: tmp[127:0] := src[127:0]
1: tmp[127:0] := src[255:128]
2: tmp[127:0] := src[383:256]
3: tmp[127:0] := src[511:384]
ESAC
RETURN tmp[127:0]
}
tmp[511:0] := 0
FOR j := 0 to 3
i := j*128
n := j*2
tmp[i+127:i] := SELECT4(a[511:0], imm8[n+1:n])
ENDFOR
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := tmp[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpermf32x4
__m512 _mm512_permute4f128_ps (__m512 a, _MM_PERM_ENUM imm8)
Synopsis
__m512 _mm512_permute4f128_ps (__m512 a, _MM_PERM_ENUM imm8)
#include "immintrin.h"
Instruction: vpermf32x4 zmm {k}, m512, imm
CPUID Flags: KNCNI
Description
Permutes 128-bit blocks of the packed single-precision (32-bit) floating-point elements in a using constant imm8. The results are stored in dst.
Operation
SELECT4(src, control) {
CASE control[1:0] OF
0: tmp[127:0] := src[127:0]
1: tmp[127:0] := src[255:128]
2: tmp[127:0] := src[383:256]
3: tmp[127:0] := src[511:384]
ESAC
RETURN tmp[127:0]
}
FOR j := 0 to 3
i := j*128
n := j*2
dst[i+127:i] := SELECT4(a[511:0], imm8[n+1:n])
ENDFOR
dst[MAX:512] := 0
vpermq
__m256i _mm256_permute4x64_epi64 (__m256i a, const int imm8)
Synopsis
__m256i _mm256_permute4x64_epi64 (__m256i a, const int imm8)
#include "immintrin.h"
Instruction: vpermq ymm, ymm, imm
CPUID Flags: AVX2
Description
Shuffle 64-bit integers in a across lanes using the control in imm8, and store the results in dst.
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[63:0] := src[63:0]
1: tmp[63:0] := src[127:64]
2: tmp[63:0] := src[191:128]
3: tmp[63:0] := src[255:192]
ESAC
RETURN tmp[63:0]
}
dst[63:0] := SELECT4(a[255:0], imm8[1:0])
dst[127:64] := SELECT4(a[255:0], imm8[3:2])
dst[191:128] := SELECT4(a[255:0], imm8[5:4])
dst[255:192] := SELECT4(a[255:0], imm8[7:6])
dst[MAX:256] := 0
Performance
vpermpd
__m256d _mm256_permute4x64_pd (__m256d a, const int imm8)
Synopsis
__m256d _mm256_permute4x64_pd (__m256d a, const int imm8)
#include "immintrin.h"
Instruction: vpermpd ymm, ymm, imm
CPUID Flags: AVX2
Description
Shuffle double-precision (64-bit) floating-point elements in a across lanes using the control in imm8, and store the results in dst.
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[63:0] := src[63:0]
1: tmp[63:0] := src[127:64]
2: tmp[63:0] := src[191:128]
3: tmp[63:0] := src[255:192]
ESAC
RETURN tmp[63:0]
}
dst[63:0] := SELECT4(a[255:0], imm8[1:0])
dst[127:64] := SELECT4(a[255:0], imm8[3:2])
dst[191:128] := SELECT4(a[255:0], imm8[5:4])
dst[255:192] := SELECT4(a[255:0], imm8[7:6])
dst[MAX:256] := 0
Performance
vpermd
__m512i _mm512_mask_permutevar_epi32 (__m512i src, __mmask16 k, __m512i idx, __m512i a)
Synopsis
__m512i _mm512_mask_permutevar_epi32 (__m512i src, __mmask16 k, __m512i idx, __m512i a)
#include "immintrin.h"
Instruction: vpermd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). Note that this intrinsic shuffles across 128-bit lanes, unlike past intrinsics that use the permutevar name. This intrinsic is identical to _mm512_mask_permutexvar_epi32, and it is recommended that you use that intrinsic name.
Operation
FOR j := 0 to 15
i := j*32
id := idx[i+3:i]*32
IF k[j]
dst[i+31:i] := a[id+31:id]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpermd
__m512i _mm512_permutevar_epi32 (__m512i idx, __m512i a)
Synopsis
__m512i _mm512_permutevar_epi32 (__m512i idx, __m512i a)
#include "immintrin.h"
Instruction: vpermd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst. Note that this intrinsic shuffles across 128-bit lanes, unlike past intrinsics that use the permutevar name. This intrinsic is identical to _mm512_permutexvar_epi32, and it is recommended that you use that intrinsic name.
Operation
FOR j := 0 to 15
i := j*32
id := idx[i+3:i]*32
dst[i+31:i] := a[id+31:id]
ENDFOR
dst[MAX:512] := 0
vpermilpd
__m128d _mm_mask_permutevar_pd (__m128d src, __mmask8 k, __m128d a, __m128i b)
Synopsis
__m128d _mm_mask_permutevar_pd (__m128d src, __mmask8 k, __m128d a, __m128i b)
#include "immintrin.h"
Instruction: vpermilpd
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle double-precision (64-bit) floating-point elements in a using the control in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
IF (b[1] == 0) tmp_dst[63:0] := a[63:0]
IF (b[1] == 1) tmp_dst[63:0] := a[127:64]
IF (b[65] == 0) tmp_dst[127:64] := a[63:0]
IF (b[65] == 1) tmp_dst[127:64] := a[127:64]
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpermilpd
__m128d _mm_maskz_permutevar_pd (__mmask8 k, __m128d a, __m128i b)
Synopsis
__m128d _mm_maskz_permutevar_pd (__mmask8 k, __m128d a, __m128i b)
#include "immintrin.h"
Instruction: vpermilpd
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle double-precision (64-bit) floating-point elements in a using the control in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
IF (b[1] == 0) tmp_dst[63:0] := a[63:0]
IF (b[1] == 1) tmp_dst[63:0] := a[127:64]
IF (b[65] == 0) tmp_dst[127:64] := a[63:0]
IF (b[65] == 1) tmp_dst[127:64] := a[127:64]
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpermilpd
__m128d _mm_permutevar_pd (__m128d a, __m128i b)
Synopsis
__m128d _mm_permutevar_pd (__m128d a, __m128i b)
#include "immintrin.h"
Instruction: vpermilpd xmm, xmm, xmm
CPUID Flags: AVX
Description
Shuffle double-precision (64-bit) floating-point elements in a using the control in b, and store the results in dst.
Operation
IF (b[1] == 0) dst[63:0] := a[63:0]
IF (b[1] == 1) dst[63:0] := a[127:64]
IF (b[65] == 0) dst[127:64] := a[63:0]
IF (b[65] == 1) dst[127:64] := a[127:64]
dst[MAX:128] := 0
Performance
vpermilpd
__m256d _mm256_mask_permutevar_pd (__m256d src, __mmask8 k, __m256d a, __m256i b)
Synopsis
__m256d _mm256_mask_permutevar_pd (__m256d src, __mmask8 k, __m256d a, __m256i b)
#include "immintrin.h"
Instruction: vpermilpd
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
IF (b[1] == 0) tmp_dst[63:0] := a[63:0]
IF (b[1] == 1) tmp_dst[63:0] := a[127:64]
IF (b[65] == 0) tmp_dst[127:64] := a[63:0]
IF (b[65] == 1) tmp_dst[127:64] := a[127:64]
IF (b[129] == 0) tmp_dst[191:128] := a[191:128]
IF (b[129] == 1) tmp_dst[191:128] := a[255:192]
IF (b[193] == 0) tmp_dst[255:192] := a[191:128]
IF (b[193] == 1) tmp_dst[255:192] := a[255:192]
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpermilpd
__m256d _mm256_maskz_permutevar_pd (__mmask8 k, __m256d a, __m256i b)
Synopsis
__m256d _mm256_maskz_permutevar_pd (__mmask8 k, __m256d a, __m256i b)
#include "immintrin.h"
Instruction: vpermilpd
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
IF (b[1] == 0) tmp_dst[63:0] := a[63:0]
IF (b[1] == 1) tmp_dst[63:0] := a[127:64]
IF (b[65] == 0) tmp_dst[127:64] := a[63:0]
IF (b[65] == 1) tmp_dst[127:64] := a[127:64]
IF (b[129] == 0) tmp_dst[191:128] := a[191:128]
IF (b[129] == 1) tmp_dst[191:128] := a[255:192]
IF (b[193] == 0) tmp_dst[255:192] := a[191:128]
IF (b[193] == 1) tmp_dst[255:192] := a[255:192]
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpermilpd
__m256d _mm256_permutevar_pd (__m256d a, __m256i b)
Synopsis
__m256d _mm256_permutevar_pd (__m256d a, __m256i b)
#include "immintrin.h"
Instruction: vpermilpd ymm, ymm, ymm
CPUID Flags: AVX
Description
Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst.
Operation
IF (b[1] == 0) dst[63:0] := a[63:0]
IF (b[1] == 1) dst[63:0] := a[127:64]
IF (b[65] == 0) dst[127:64] := a[63:0]
IF (b[65] == 1) dst[127:64] := a[127:64]
IF (b[129] == 0) dst[191:128] := a[191:128]
IF (b[129] == 1) dst[191:128] := a[255:192]
IF (b[193] == 0) dst[255:192] := a[191:128]
IF (b[193] == 1) dst[255:192] := a[255:192]
dst[MAX:256] := 0
Performance
vpermilpd
__m512d _mm512_mask_permutevar_pd (__m512d src, __mmask8 k, __m512d a, __m512i b)
Synopsis
__m512d _mm512_mask_permutevar_pd (__m512d src, __mmask8 k, __m512d a, __m512i b)
#include "immintrin.h"
Instruction: vpermilpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
IF (b[1] == 0) tmp_dst[63:0] := a[63:0]
IF (b[1] == 1) tmp_dst[63:0] := a[127:64]
IF (b[65] == 0) tmp_dst[127:64] := a[63:0]
IF (b[65] == 1) tmp_dst[127:64] := a[127:64]
IF (b[129] == 0) tmp_dst[191:128] := a[191:128]
IF (b[129] == 1) tmp_dst[191:128] := a[255:192]
IF (b[193] == 0) tmp_dst[255:192] := a[191:128]
IF (b[193] == 1) tmp_dst[255:192] := a[255:192]
IF (b[257] == 0) tmp_dst[319:256] := a[319:256]
IF (b[257] == 1) tmp_dst[319:256] := a[383:320]
IF (b[321] == 0) tmp_dst[383:320] := a[319:256]
IF (b[321] == 1) tmp_dst[383:320] := a[383:320]
IF (b[385] == 0) tmp_dst[447:384] := a[447:384]
IF (b[385] == 1) tmp_dst[447:384] := a[511:448]
IF (b[449] == 0) tmp_dst[511:448] := a[447:384]
IF (b[449] == 1) tmp_dst[511:448] := a[511:448]
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpermilpd
__m512d _mm512_maskz_permutevar_pd (__mmask8 k, __m512d a, __m512i b)
Synopsis
__m512d _mm512_maskz_permutevar_pd (__mmask8 k, __m512d a, __m512i b)
#include "immintrin.h"
Instruction: vpermilpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
IF (b[1] == 0) tmp_dst[63:0] := a[63:0]
IF (b[1] == 1) tmp_dst[63:0] := a[127:64]
IF (b[65] == 0) tmp_dst[127:64] := a[63:0]
IF (b[65] == 1) tmp_dst[127:64] := a[127:64]
IF (b[129] == 0) tmp_dst[191:128] := a[191:128]
IF (b[129] == 1) tmp_dst[191:128] := a[255:192]
IF (b[193] == 0) tmp_dst[255:192] := a[191:128]
IF (b[193] == 1) tmp_dst[255:192] := a[255:192]
IF (b[257] == 0) tmp_dst[319:256] := a[319:256]
IF (b[257] == 1) tmp_dst[319:256] := a[383:320]
IF (b[321] == 0) tmp_dst[383:320] := a[319:256]
IF (b[321] == 1) tmp_dst[383:320] := a[383:320]
IF (b[385] == 0) tmp_dst[447:384] := a[447:384]
IF (b[385] == 1) tmp_dst[447:384] := a[511:448]
IF (b[449] == 0) tmp_dst[511:448] := a[447:384]
IF (b[449] == 1) tmp_dst[511:448] := a[511:448]
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpermilpd
__m512d _mm512_permutevar_pd (__m512d a, __m512i b)
Synopsis
__m512d _mm512_permutevar_pd (__m512d a, __m512i b)
#include "immintrin.h"
Instruction: vpermilpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst.
Operation
IF (b[1] == 0) dst[63:0] := a[63:0]
IF (b[1] == 1) dst[63:0] := a[127:64]
IF (b[65] == 0) dst[127:64] := a[63:0]
IF (b[65] == 1) dst[127:64] := a[127:64]
IF (b[129] == 0) dst[191:128] := a[191:128]
IF (b[129] == 1) dst[191:128] := a[255:192]
IF (b[193] == 0) dst[255:192] := a[191:128]
IF (b[193] == 1) dst[255:192] := a[255:192]
IF (b[257] == 0) dst[319:256] := a[319:256]
IF (b[257] == 1) dst[319:256] := a[383:320]
IF (b[321] == 0) dst[383:320] := a[319:256]
IF (b[321] == 1) dst[383:320] := a[383:320]
IF (b[385] == 0) dst[447:384] := a[447:384]
IF (b[385] == 1) dst[447:384] := a[511:448]
IF (b[449] == 0) dst[511:448] := a[447:384]
IF (b[449] == 1) dst[511:448] := a[511:448]
dst[MAX:512] := 0
vpermilps
__m128 _mm_mask_permutevar_ps (__m128 src, __mmask8 k, __m128 a, __m128i b)
Synopsis
__m128 _mm_mask_permutevar_ps (__m128 src, __mmask8 k, __m128 a, __m128i b)
#include "immintrin.h"
Instruction: vpermilps
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle single-precision (32-bit) floating-point elements in a using the control in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[31:0] := src[31:0]
1: tmp[31:0] := src[63:32]
2: tmp[31:0] := src[95:64]
3: tmp[31:0] := src[127:96]
ESAC
RETURN tmp[31:0]
}
tmp_dst[31:0] := SELECT4(a[127:0], b[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], b[33:32])
tmp_dst[95:64] := SELECT4(a[127:0], b[65:64])
tmp_dst[127:96] := SELECT4(a[127:0], b[97:96])
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vpermilps
__m128 _mm_maskz_permutevar_ps (__mmask8 k, __m128 a, __m128i b)
Synopsis
__m128 _mm_maskz_permutevar_ps (__mmask8 k, __m128 a, __m128i b)
#include "immintrin.h"
Instruction: vpermilps
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle single-precision (32-bit) floating-point elements in a using the control in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[31:0] := src[31:0]
1: tmp[31:0] := src[63:32]
2: tmp[31:0] := src[95:64]
3: tmp[31:0] := src[127:96]
ESAC
RETURN tmp[31:0]
}
tmp_dst[31:0] := SELECT4(a[127:0], b[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], b[33:32])
tmp_dst[95:64] := SELECT4(a[127:0], b[65:64])
tmp_dst[127:96] := SELECT4(a[127:0], b[97:96])
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpermilps
__m128 _mm_permutevar_ps (__m128 a, __m128i b)
Synopsis
__m128 _mm_permutevar_ps (__m128 a, __m128i b)
#include "immintrin.h"
Instruction: vpermilps xmm, xmm, xmm
CPUID Flags: AVX
Description
Shuffle single-precision (32-bit) floating-point elements in a using the control in b, and store the results in dst.
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[31:0] := src[31:0]
1: tmp[31:0] := src[63:32]
2: tmp[31:0] := src[95:64]
3: tmp[31:0] := src[127:96]
ESAC
RETURN tmp[31:0]
}
dst[31:0] := SELECT4(a[127:0], b[1:0])
dst[63:32] := SELECT4(a[127:0], b[33:32])
dst[95:64] := SELECT4(a[127:0], b[65:64])
dst[127:96] := SELECT4(a[127:0], b[97:96])
dst[MAX:128] := 0
Performance
vpermilps
__m256 _mm256_mask_permutevar_ps (__m256 src, __mmask8 k, __m256 a, __m256i b)
Synopsis
__m256 _mm256_mask_permutevar_ps (__m256 src, __mmask8 k, __m256 a, __m256i b)
#include "immintrin.h"
Instruction: vpermilps
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[31:0] := src[31:0]
1: tmp[31:0] := src[63:32]
2: tmp[31:0] := src[95:64]
3: tmp[31:0] := src[127:96]
ESAC
RETURN tmp[31:0]
}
tmp_dst[31:0] := SELECT4(a[127:0], b[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], b[33:32])
tmp_dst[95:64] := SELECT4(a[127:0], b[65:64])
tmp_dst[127:96] := SELECT4(a[127:0], b[97:96])
tmp_dst[159:128] := SELECT4(a[255:128], b[129:128])
tmp_dst[191:160] := SELECT4(a[255:128], b[161:160])
tmp_dst[223:192] := SELECT4(a[255:128], b[193:192])
tmp_dst[255:224] := SELECT4(a[255:128], b[225:224])
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vpermilps
__m256 _mm256_maskz_permutevar_ps (__mmask8 k, __m256 a, __m256i b)
Synopsis
__m256 _mm256_maskz_permutevar_ps (__mmask8 k, __m256 a, __m256i b)
#include "immintrin.h"
Instruction: vpermilps
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[31:0] := src[31:0]
1: tmp[31:0] := src[63:32]
2: tmp[31:0] := src[95:64]
3: tmp[31:0] := src[127:96]
ESAC
RETURN tmp[31:0]
}
tmp_dst[31:0] := SELECT4(a[127:0], b[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], b[33:32])
tmp_dst[95:64] := SELECT4(a[127:0], b[65:64])
tmp_dst[127:96] := SELECT4(a[127:0], b[97:96])
tmp_dst[159:128] := SELECT4(a[255:128], b[129:128])
tmp_dst[191:160] := SELECT4(a[255:128], b[161:160])
tmp_dst[223:192] := SELECT4(a[255:128], b[193:192])
tmp_dst[255:224] := SELECT4(a[255:128], b[225:224])
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpermilps
__m256 _mm256_permutevar_ps (__m256 a, __m256i b)
Synopsis
__m256 _mm256_permutevar_ps (__m256 a, __m256i b)
#include "immintrin.h"
Instruction: vpermilps ymm, ymm, ymm
CPUID Flags: AVX
Description
Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst.
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[31:0] := src[31:0]
1: tmp[31:0] := src[63:32]
2: tmp[31:0] := src[95:64]
3: tmp[31:0] := src[127:96]
ESAC
RETURN tmp[31:0]
}
dst[31:0] := SELECT4(a[127:0], b[1:0])
dst[63:32] := SELECT4(a[127:0], b[33:32])
dst[95:64] := SELECT4(a[127:0], b[65:64])
dst[127:96] := SELECT4(a[127:0], b[97:96])
dst[159:128] := SELECT4(a[255:128], b[129:128])
dst[191:160] := SELECT4(a[255:128], b[161:160])
dst[223:192] := SELECT4(a[255:128], b[193:192])
dst[255:224] := SELECT4(a[255:128], b[225:224])
dst[MAX:256] := 0
Performance
vpermilps
__m512 _mm512_mask_permutevar_ps (__m512 src, __mmask16 k, __m512 a, __m512i b)
Synopsis
__m512 _mm512_mask_permutevar_ps (__m512 src, __mmask16 k, __m512 a, __m512i b)
#include "immintrin.h"
Instruction: vpermilps zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[31:0] := src[31:0]
1: tmp[31:0] := src[63:32]
2: tmp[31:0] := src[95:64]
3: tmp[31:0] := src[127:96]
ESAC
RETURN tmp[31:0]
}
tmp_dst[31:0] := SELECT4(a[127:0], b[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], b[33:32])
tmp_dst[95:64] := SELECT4(a[127:0], b[65:64])
tmp_dst[127:96] := SELECT4(a[127:0], b[97:96])
tmp_dst[159:128] := SELECT4(a[255:128], b[129:128])
tmp_dst[191:160] := SELECT4(a[255:128], b[161:160])
tmp_dst[223:192] := SELECT4(a[255:128], b[193:192])
tmp_dst[255:224] := SELECT4(a[255:128], b[225:224])
tmp_dst[287:256] := SELECT4(a[383:256], b[257:256])
tmp_dst[319:288] := SELECT4(a[383:256], b[289:288])
tmp_dst[351:320] := SELECT4(a[383:256], b[321:320])
tmp_dst[383:352] := SELECT4(a[383:256], b[353:352])
tmp_dst[415:384] := SELECT4(a[511:384], b[385:384])
tmp_dst[447:416] := SELECT4(a[511:384], b[417:416])
tmp_dst[479:448] := SELECT4(a[511:384], b[449:448])
tmp_dst[511:480] := SELECT4(a[511:384], b[481:480])
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpermilps
__m512 _mm512_maskz_permutevar_ps (__mmask16 k, __m512 a, __m512i b)
Synopsis
__m512 _mm512_maskz_permutevar_ps (__mmask16 k, __m512 a, __m512i b)
#include "immintrin.h"
Instruction: vpermilps zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[31:0] := src[31:0]
1: tmp[31:0] := src[63:32]
2: tmp[31:0] := src[95:64]
3: tmp[31:0] := src[127:96]
ESAC
RETURN tmp[31:0]
}
tmp_dst[31:0] := SELECT4(a[127:0], b[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], b[33:32])
tmp_dst[95:64] := SELECT4(a[127:0], b[65:64])
tmp_dst[127:96] := SELECT4(a[127:0], b[97:96])
tmp_dst[159:128] := SELECT4(a[255:128], b[129:128])
tmp_dst[191:160] := SELECT4(a[255:128], b[161:160])
tmp_dst[223:192] := SELECT4(a[255:128], b[193:192])
tmp_dst[255:224] := SELECT4(a[255:128], b[225:224])
tmp_dst[287:256] := SELECT4(a[383:256], b[257:256])
tmp_dst[319:288] := SELECT4(a[383:256], b[289:288])
tmp_dst[351:320] := SELECT4(a[383:256], b[321:320])
tmp_dst[383:352] := SELECT4(a[383:256], b[353:352])
tmp_dst[415:384] := SELECT4(a[511:384], b[385:384])
tmp_dst[447:416] := SELECT4(a[511:384], b[417:416])
tmp_dst[479:448] := SELECT4(a[511:384], b[449:448])
tmp_dst[511:480] := SELECT4(a[511:384], b[481:480])
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpermilps
__m512 _mm512_permutevar_ps (__m512 a, __m512i b)
Synopsis
__m512 _mm512_permutevar_ps (__m512 a, __m512i b)
#include "immintrin.h"
Instruction: vpermilps zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in b, and store the results in dst.
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[31:0] := src[31:0]
1: tmp[31:0] := src[63:32]
2: tmp[31:0] := src[95:64]
3: tmp[31:0] := src[127:96]
ESAC
RETURN tmp[31:0]
}
dst[31:0] := SELECT4(a[127:0], b[1:0])
dst[63:32] := SELECT4(a[127:0], b[33:32])
dst[95:64] := SELECT4(a[127:0], b[65:64])
dst[127:96] := SELECT4(a[127:0], b[97:96])
dst[159:128] := SELECT4(a[255:128], b[129:128])
dst[191:160] := SELECT4(a[255:128], b[161:160])
dst[223:192] := SELECT4(a[255:128], b[193:192])
dst[255:224] := SELECT4(a[255:128], b[225:224])
dst[287:256] := SELECT4(a[383:256], b[257:256])
dst[319:288] := SELECT4(a[383:256], b[289:288])
dst[351:320] := SELECT4(a[383:256], b[321:320])
dst[383:352] := SELECT4(a[383:256], b[353:352])
dst[415:384] := SELECT4(a[511:384], b[385:384])
dst[447:416] := SELECT4(a[511:384], b[417:416])
dst[479:448] := SELECT4(a[511:384], b[449:448])
dst[511:480] := SELECT4(a[511:384], b[481:480])
dst[MAX:512] := 0
vpermd
__m256i _mm256_permutevar8x32_epi32 (__m256i a, __m256i idx)
Synopsis
__m256i _mm256_permutevar8x32_epi32 (__m256i a, __m256i idx)
#include "immintrin.h"
Instruction: vpermd ymm, ymm, ymm
CPUID Flags: AVX2
Description
Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
id := idx[i+2:i]*32
dst[i+31:i] := a[id+31:id]
ENDFOR
dst[MAX:256] := 0
Performance
vpermps
__m256 _mm256_permutevar8x32_ps (__m256 a, __m256i idx)
Synopsis
__m256 _mm256_permutevar8x32_ps (__m256 a, __m256i idx)
#include "immintrin.h"
Instruction: vpermps ymm, ymm, ymm
CPUID Flags: AVX2
Description
Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
id := idx[i+2:i]*32
dst[i+31:i] := a[id+31:id]
ENDFOR
dst[MAX:256] := 0
Performance
vpermq
__m256i _mm256_mask_permutex_epi64 (__m256i src, __mmask8 k, __m256i a, const int imm8)
Synopsis
__m256i _mm256_mask_permutex_epi64 (__m256i src, __mmask8 k, __m256i a, const int imm8)
#include "immintrin.h"
Instruction: vpermq
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle 64-bit integers in a across lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[63:0] := src[63:0]
1: tmp[63:0] := src[127:64]
2: tmp[63:0] := src[191:128]
3: tmp[63:0] := src[255:192]
ESAC
RETURN tmp[63:0]
}
tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0])
tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2])
tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4])
tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6])
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpermq
__m256i _mm256_maskz_permutex_epi64 (__mmask8 k, __m256i a, const int imm8)
Synopsis
__m256i _mm256_maskz_permutex_epi64 (__mmask8 k, __m256i a, const int imm8)
#include "immintrin.h"
Instruction: vpermq
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle 64-bit integers in a across lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[63:0] := src[63:0]
1: tmp[63:0] := src[127:64]
2: tmp[63:0] := src[191:128]
3: tmp[63:0] := src[255:192]
ESAC
RETURN tmp[63:0]
}
tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0])
tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2])
tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4])
tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6])
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpermq
__m256i _mm256_permutex_epi64 (__m256i a, const int imm8)
Synopsis
__m256i _mm256_permutex_epi64 (__m256i a, const int imm8)
#include "immintrin.h"
Instruction: vpermq
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle 64-bit integers in a across lanes using the control in imm8, and store the results in dst.
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[63:0] := src[63:0]
1: tmp[63:0] := src[127:64]
2: tmp[63:0] := src[191:128]
3: tmp[63:0] := src[255:192]
ESAC
RETURN tmp[63:0]
}
dst[63:0] := SELECT4(a[255:0], imm8[1:0])
dst[127:64] := SELECT4(a[255:0], imm8[3:2])
dst[191:128] := SELECT4(a[255:0], imm8[5:4])
dst[255:192] := SELECT4(a[255:0], imm8[7:6])
dst[MAX:256] := 0
vpermq
__m512i _mm512_mask_permutex_epi64 (__m512i src, __mmask8 k, __m512i a, const int imm8)
Synopsis
__m512i _mm512_mask_permutex_epi64 (__m512i src, __mmask8 k, __m512i a, const int imm8)
#include "immintrin.h"
Instruction: vpermq zmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Shuffle 64-bit integers in a within 256-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[63:0] := src[63:0]
1: tmp[63:0] := src[127:64]
2: tmp[63:0] := src[191:128]
3: tmp[63:0] := src[255:192]
ESAC
RETURN tmp[63:0]
}
tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0])
tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2])
tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4])
tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6])
tmp_dst[319:256] := SELECT4(a[511:256], imm8[1:0])
tmp_dst[383:320] := SELECT4(a[511:256], imm8[3:2])
tmp_dst[447:384] := SELECT4(a[511:256], imm8[5:4])
tmp_dst[511:448] := SELECT4(a[511:256], imm8[7:6])
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpermq
__m512i _mm512_maskz_permutex_epi64 (__mmask8 k, __m512i a, const int imm8)
Synopsis
__m512i _mm512_maskz_permutex_epi64 (__mmask8 k, __m512i a, const int imm8)
#include "immintrin.h"
Instruction: vpermq zmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Shuffle 64-bit integers in a within 256-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[63:0] := src[63:0]
1: tmp[63:0] := src[127:64]
2: tmp[63:0] := src[191:128]
3: tmp[63:0] := src[255:192]
ESAC
RETURN tmp[63:0]
}
tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0])
tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2])
tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4])
tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6])
tmp_dst[319:256] := SELECT4(a[511:256], imm8[1:0])
tmp_dst[383:320] := SELECT4(a[511:256], imm8[3:2])
tmp_dst[447:384] := SELECT4(a[511:256], imm8[5:4])
tmp_dst[511:448] := SELECT4(a[511:256], imm8[7:6])
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpermq
__m512i _mm512_permutex_epi64 (__m512i a, const int imm8)
Synopsis
__m512i _mm512_permutex_epi64 (__m512i a, const int imm8)
#include "immintrin.h"
Instruction: vpermq zmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Shuffle 64-bit integers in a within 256-bit lanes using the control in imm8, and store the results in dst.
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[63:0] := src[63:0]
1: tmp[63:0] := src[127:64]
2: tmp[63:0] := src[191:128]
3: tmp[63:0] := src[255:192]
ESAC
RETURN tmp[63:0]
}
dst[63:0] := SELECT4(a[255:0], imm8[1:0])
dst[127:64] := SELECT4(a[255:0], imm8[3:2])
dst[191:128] := SELECT4(a[255:0], imm8[5:4])
dst[255:192] := SELECT4(a[255:0], imm8[7:6])
dst[319:256] := SELECT4(a[511:256], imm8[1:0])
dst[383:320] := SELECT4(a[511:256], imm8[3:2])
dst[447:384] := SELECT4(a[511:256], imm8[5:4])
dst[511:448] := SELECT4(a[511:256], imm8[7:6])
dst[MAX:512] := 0
vpermpd
__m256d _mm256_mask_permutex_pd (__m256d src, __mmask8 k, __m256d a, int imm8)
Synopsis
__m256d _mm256_mask_permutex_pd (__m256d src, __mmask8 k, __m256d a, int imm8)
#include "immintrin.h"
Instruction: vpermpd
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle double-precision (64-bit) floating-point elements in a across lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[63:0] := src[63:0]
1: tmp[63:0] := src[127:64]
2: tmp[63:0] := src[191:128]
3: tmp[63:0] := src[255:192]
ESAC
RETURN tmp[63:0]
}
tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0])
tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2])
tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4])
tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6])
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpermpd
__m256d _mm256_maskz_permutex_pd (__mmask8 k, __m256d a, int imm8)
Synopsis
__m256d _mm256_maskz_permutex_pd (__mmask8 k, __m256d a, int imm8)
#include "immintrin.h"
Instruction: vpermpd
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle double-precision (64-bit) floating-point elements in a across lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[63:0] := src[63:0]
1: tmp[63:0] := src[127:64]
2: tmp[63:0] := src[191:128]
3: tmp[63:0] := src[255:192]
ESAC
RETURN tmp[63:0]
}
tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0])
tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2])
tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4])
tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6])
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpermpd
__m256d _mm256_permutex_pd (__m256d a, int imm8)
Synopsis
__m256d _mm256_permutex_pd (__m256d a, int imm8)
#include "immintrin.h"
Instruction: vpermpd
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle double-precision (64-bit) floating-point elements in a across lanes using the control in imm8, and store the results in dst.
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[63:0] := src[63:0]
1: tmp[63:0] := src[127:64]
2: tmp[63:0] := src[191:128]
3: tmp[63:0] := src[255:192]
ESAC
RETURN tmp[63:0]
}
dst[63:0] := SELECT4(a[255:0], imm8[1:0])
dst[127:64] := SELECT4(a[255:0], imm8[3:2])
dst[191:128] := SELECT4(a[255:0], imm8[5:4])
dst[255:192] := SELECT4(a[255:0], imm8[7:6])
dst[MAX:256] := 0
vpermpd
__m512d _mm512_mask_permutex_pd (__m512d src, __mmask8 k, __m512d a, const int imm8)
Synopsis
__m512d _mm512_mask_permutex_pd (__m512d src, __mmask8 k, __m512d a, const int imm8)
#include "immintrin.h"
Instruction: vpermpd zmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Shuffle double-precision (64-bit) floating-point elements in a within 256-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[63:0] := src[63:0]
1: tmp[63:0] := src[127:64]
2: tmp[63:0] := src[191:128]
3: tmp[63:0] := src[255:192]
ESAC
RETURN tmp[63:0]
}
tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0])
tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2])
tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4])
tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6])
tmp_dst[319:256] := SELECT4(a[511:256], imm8[1:0])
tmp_dst[383:320] := SELECT4(a[511:256], imm8[3:2])
tmp_dst[447:384] := SELECT4(a[511:256], imm8[5:4])
tmp_dst[511:448] := SELECT4(a[511:256], imm8[7:6])
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpermpd
__m512d _mm512_maskz_permutex_pd (__mmask8 k, __m512d a, const int imm8)
Synopsis
__m512d _mm512_maskz_permutex_pd (__mmask8 k, __m512d a, const int imm8)
#include "immintrin.h"
Instruction: vpermpd zmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Shuffle double-precision (64-bit) floating-point elements in a within 256-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[63:0] := src[63:0]
1: tmp[63:0] := src[127:64]
2: tmp[63:0] := src[191:128]
3: tmp[63:0] := src[255:192]
ESAC
RETURN tmp[63:0]
}
tmp_dst[63:0] := SELECT4(a[255:0], imm8[1:0])
tmp_dst[127:64] := SELECT4(a[255:0], imm8[3:2])
tmp_dst[191:128] := SELECT4(a[255:0], imm8[5:4])
tmp_dst[255:192] := SELECT4(a[255:0], imm8[7:6])
tmp_dst[319:256] := SELECT4(a[511:256], imm8[1:0])
tmp_dst[383:320] := SELECT4(a[511:256], imm8[3:2])
tmp_dst[447:384] := SELECT4(a[511:256], imm8[5:4])
tmp_dst[511:448] := SELECT4(a[511:256], imm8[7:6])
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpermpd
__m512d _mm512_permutex_pd (__m512d a, const int imm8)
Synopsis
__m512d _mm512_permutex_pd (__m512d a, const int imm8)
#include "immintrin.h"
Instruction: vpermpd zmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Shuffle double-precision (64-bit) floating-point elements in a within 256-bit lanes using the control in imm8, and store the results in dst.
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[63:0] := src[63:0]
1: tmp[63:0] := src[127:64]
2: tmp[63:0] := src[191:128]
3: tmp[63:0] := src[255:192]
ESAC
RETURN tmp[63:0]
}
dst[63:0] := SELECT4(a[255:0], imm8[1:0])
dst[127:64] := SELECT4(a[255:0], imm8[3:2])
dst[191:128] := SELECT4(a[255:0], imm8[5:4])
dst[255:192] := SELECT4(a[255:0], imm8[7:6])
dst[319:256] := SELECT4(a[511:256], imm8[1:0])
dst[383:320] := SELECT4(a[511:256], imm8[3:2])
dst[447:384] := SELECT4(a[511:256], imm8[5:4])
dst[511:448] := SELECT4(a[511:256], imm8[7:6])
dst[MAX:512] := 0
vpermt2w
__m128i _mm_mask_permutex2var_epi16 (__m128i a, __mmask8 k, __m128i idx, __m128i b)
Synopsis
__m128i _mm_mask_permutex2var_epi16 (__m128i a, __mmask8 k, __m128i idx, __m128i b)
#include "immintrin.h"
Instruction: vpermt2w
CPUID Flags: AVX512VL + AVX512BW
Description
Shuffle 16-bit integers in a and b using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
off := 16*idx[i+2:i]
dst[i+15:i] := idx[i+3] ? b[off+15:off] : a[off+15:off]
ELSE
dst[i+15:i] := a[i+15:i]
FI
ENDFOR
dst[MAX:128] := 0
vpermi2w
__m128i _mm_mask2_permutex2var_epi16 (__m128i a, __m128i idx, __mmask8 k, __m128i b)
Synopsis
__m128i _mm_mask2_permutex2var_epi16 (__m128i a, __m128i idx, __mmask8 k, __m128i b)
#include "immintrin.h"
Instruction: vpermi2w
CPUID Flags: AVX512VL + AVX512BW
Description
Shuffle 16-bit integers in a and b using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
off := 16*idx[i+2:i]
dst[i+15:i] := idx[i+3] ? b[off+15:off] : a[off+15:off]
ELSE
dst[i+15:i] := idx[i+15:i]
FI
ENDFOR
dst[MAX:128] := 0
vpermi2w, vpermt2w
__m128i _mm_maskz_permutex2var_epi16 (__mmask8 k, __m128i a, __m128i idx, __m128i b)
Synopsis
__m128i _mm_maskz_permutex2var_epi16 (__mmask8 k, __m128i a, __m128i idx, __m128i b)
#include "immintrin.h"
Instruction: vpermi2w
vpermt2w
CPUID Flags: AVX512VL + AVX512BW
Description
Shuffle 16-bit integers in a and b using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
off := 16*idx[i+2:i]
dst[i+15:i] := idx[i+3] ? b[off+15:off] : a[off+15:off]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpermi2w, vpermt2w
__m128i _mm_permutex2var_epi16 (__m128i a, __m128i idx, __m128i b)
Synopsis
__m128i _mm_permutex2var_epi16 (__m128i a, __m128i idx, __m128i b)
#include "immintrin.h"
Instruction: vpermi2w
vpermt2w
CPUID Flags: AVX512VL + AVX512BW
Description
Shuffle 16-bit integers in a and b using the corresponding selector and index in idx, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*16
off := 16*idx[i+2:i]
dst[i+15:i] := idx[i+3] ? b[off+15:off] : a[off+15:off]
ENDFOR
dst[MAX:128] := 0
vpermt2w
__m256i _mm256_mask_permutex2var_epi16 (__m256i a, __mmask16 k, __m256i idx, __m256i b)
Synopsis
__m256i _mm256_mask_permutex2var_epi16 (__m256i a, __mmask16 k, __m256i idx, __m256i b)
#include "immintrin.h"
Instruction: vpermt2w
CPUID Flags: AVX512VL + AVX512BW
Description
Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
off := 16*idx[i+3:i]
dst[i+15:i] := idx[i+4] ? b[off+15:off] : a[off+15:off]
ELSE
dst[i+15:i] := a[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
vpermi2w
__m256i _mm256_mask2_permutex2var_epi16 (__m256i a, __m256i idx, __mmask16 k, __m256i b)
Synopsis
__m256i _mm256_mask2_permutex2var_epi16 (__m256i a, __m256i idx, __mmask16 k, __m256i b)
#include "immintrin.h"
Instruction: vpermi2w
CPUID Flags: AVX512VL + AVX512BW
Description
Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
off := 16*idx[i+3:i]
dst[i+15:i] := idx[i+4] ? b[off+15:off] : a[off+15:off]
ELSE
dst[i+15:i] := idx[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
vpermi2w, vpermt2w
__m256i _mm256_maskz_permutex2var_epi16 (__mmask16 k, __m256i a, __m256i idx, __m256i b)
Synopsis
__m256i _mm256_maskz_permutex2var_epi16 (__mmask16 k, __m256i a, __m256i idx, __m256i b)
#include "immintrin.h"
Instruction: vpermi2w
vpermt2w
CPUID Flags: AVX512VL + AVX512BW
Description
Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
off := 16*idx[i+3:i]
dst[i+15:i] := idx[i+4] ? b[off+15:off] : a[off+15:off]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpermi2w, vpermt2w
__m256i _mm256_permutex2var_epi16 (__m256i a, __m256i idx, __m256i b)
Synopsis
__m256i _mm256_permutex2var_epi16 (__m256i a, __m256i idx, __m256i b)
#include "immintrin.h"
Instruction: vpermi2w
vpermt2w
CPUID Flags: AVX512VL + AVX512BW
Description
Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*16
off := 16*idx[i+3:i]
dst[i+15:i] := idx[i+4] ? b[off+15:off] : a[off+15:off]
ENDFOR
dst[MAX:256] := 0
vpermt2w
__m512i _mm512_mask_permutex2var_epi16 (__m512i a, __mmask32 k, __m512i idx, __m512i b)
Synopsis
__m512i _mm512_mask_permutex2var_epi16 (__m512i a, __mmask32 k, __m512i idx, __m512i b)
#include "immintrin.h"
Instruction: vpermt2w
CPUID Flags: AVX512BW
Description
Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
off := 16*idx[i+4:i]
dst[i+15:i] := idx[i+5] ? b[off+15:off] : a[off+15:off]
ELSE
dst[i+15:i] := a[i+15:i]
FI
ENDFOR
dst[MAX:512] := 0
vpermi2w
__m512i _mm512_mask2_permutex2var_epi16 (__m512i a, __m512i idx, __mmask32 k, __m512i b)
Synopsis
__m512i _mm512_mask2_permutex2var_epi16 (__m512i a, __m512i idx, __mmask32 k, __m512i b)
#include "immintrin.h"
Instruction: vpermi2w
CPUID Flags: AVX512BW
Description
Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
off := 16*idx[i+4:i]
dst[i+15:i] := idx[i+5] ? b[off+15:off] : a[off+15:off]
ELSE
dst[i+15:i] := idx[i+15:i]
FI
ENDFOR
dst[MAX:512] := 0
vpermi2w, vpermt2w
__m512i _mm512_maskz_permutex2var_epi16 (__mmask32 k, __m512i a, __m512i idx, __m512i b)
Synopsis
__m512i _mm512_maskz_permutex2var_epi16 (__mmask32 k, __m512i a, __m512i idx, __m512i b)
#include "immintrin.h"
Instruction: vpermi2w
vpermt2w
CPUID Flags: AVX512BW
Description
Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
off := 16*idx[i+4:i]
dst[i+15:i] := idx[i+5] ? b[off+15:off] : a[off+15:off]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpermi2w, vpermt2w
__m512i _mm512_permutex2var_epi16 (__m512i a, __m512i idx, __m512i b)
Synopsis
__m512i _mm512_permutex2var_epi16 (__m512i a, __m512i idx, __m512i b)
#include "immintrin.h"
Instruction: vpermi2w
vpermt2w
CPUID Flags: AVX512BW
Description
Shuffle 16-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.
Operation
FOR j := 0 to 31
i := j*16
off := 16*idx[i+4:i]
dst[i+15:i] := idx[i+5] ? b[off+15:off] : a[off+15:off]
ENDFOR
dst[MAX:512] := 0
vpermt2d
__m128i _mm_mask_permutex2var_epi32 (__m128i a, __mmask8 k, __m128i idx, __m128i b)
Synopsis
__m128i _mm_mask_permutex2var_epi32 (__m128i a, __mmask8 k, __m128i idx, __m128i b)
#include "immintrin.h"
Instruction: vpermt2d
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle 32-bit integers in a and b using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
off := idx[i+1:i]*32
IF k[j]
dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off]
ELSE
dst[i+31:i] := a[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vpermi2d
__m128i _mm_mask2_permutex2var_epi32 (__m128i a, __m128i idx, __mmask8 k, __m128i b)
Synopsis
__m128i _mm_mask2_permutex2var_epi32 (__m128i a, __m128i idx, __mmask8 k, __m128i b)
#include "immintrin.h"
Instruction: vpermi2d
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle 32-bit integers in a and b using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
off := idx[i+1:i]*32
IF k[j]
dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off]
ELSE
dst[i+31:i] := idx[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vpermi2d, vpermt2d
__m128i _mm_maskz_permutex2var_epi32 (__mmask8 k, __m128i a, __m128i idx, __m128i b)
Synopsis
__m128i _mm_maskz_permutex2var_epi32 (__mmask8 k, __m128i a, __m128i idx, __m128i b)
#include "immintrin.h"
Instruction: vpermi2d
vpermt2d
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle 32-bit integers in a and b using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
off := idx[i+1:i]*32
IF k[j]
dst[i+31:i] := (idx[i+2]) ? b[off+31:off] : a[off+31:off]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpermi2d, vpermt2d
__m128i _mm_permutex2var_epi32 (__m128i a, __m128i idx, __m128i b)
Synopsis
__m128i _mm_permutex2var_epi32 (__m128i a, __m128i idx, __m128i b)
#include "immintrin.h"
Instruction: vpermi2d
vpermt2d
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle 32-bit integers in a and b using the corresponding selector and index in idx, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
off := idx[i+1:i]*32
dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off]
ENDFOR
dst[MAX:128] := 0
vpermt2d
__m256i _mm256_mask_permutex2var_epi32 (__m256i a, __mmask8 k, __m256i idx, __m256i b)
Synopsis
__m256i _mm256_mask_permutex2var_epi32 (__m256i a, __mmask8 k, __m256i idx, __m256i b)
#include "immintrin.h"
Instruction: vpermt2d
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
off := idx[i+2:i]*32
IF k[j]
dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off]
ELSE
dst[i+31:i] := a[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vpermi2d
__m256i _mm256_mask2_permutex2var_epi32 (__m256i a, __m256i idx, __mmask8 k, __m256i b)
Synopsis
__m256i _mm256_mask2_permutex2var_epi32 (__m256i a, __m256i idx, __mmask8 k, __m256i b)
#include "immintrin.h"
Instruction: vpermi2d
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
off := idx[i+2:i]*32
IF k[j]
dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off]
ELSE
dst[i+31:i] := idx[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vpermi2d, vpermt2d
__m256i _mm256_maskz_permutex2var_epi32 (__mmask8 k, __m256i a, __m256i idx, __m256i b)
Synopsis
__m256i _mm256_maskz_permutex2var_epi32 (__mmask8 k, __m256i a, __m256i idx, __m256i b)
#include "immintrin.h"
Instruction: vpermi2d
vpermt2d
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
off := idx[i+2:i]*32
IF k[j]
dst[i+31:i] := (idx[i+3]) ? b[off+31:off] : a[off+31:off]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpermi2d, vpermt2d
__m256i _mm256_permutex2var_epi32 (__m256i a, __m256i idx, __m256i b)
Synopsis
__m256i _mm256_permutex2var_epi32 (__m256i a, __m256i idx, __m256i b)
#include "immintrin.h"
Instruction: vpermi2d
vpermt2d
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
off := idx[i+2:i]*32
dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off]
ENDFOR
dst[MAX:256] := 0
vpermt2d
__m512i _mm512_mask_permutex2var_epi32 (__m512i a, __mmask16 k, __m512i idx, __m512i b)
Synopsis
__m512i _mm512_mask_permutex2var_epi32 (__m512i a, __mmask16 k, __m512i idx, __m512i b)
#include "immintrin.h"
Instruction: vpermt2d zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
off := idx[i+3:i]*32
IF k[j]
dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off]
ELSE
dst[i+31:i] := a[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpermi2d
__m512i _mm512_mask2_permutex2var_epi32 (__m512i a, __m512i idx, __mmask16 k, __m512i b)
Synopsis
__m512i _mm512_mask2_permutex2var_epi32 (__m512i a, __m512i idx, __mmask16 k, __m512i b)
#include "immintrin.h"
Instruction: vpermi2d zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
off := idx[i+3:i]*32
IF k[j]
dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off]
ELSE
dst[i+31:i] := idx[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpermi2d, vpermt2d
__m512i _mm512_maskz_permutex2var_epi32 (__mmask16 k, __m512i a, __m512i idx, __m512i b)
Synopsis
__m512i _mm512_maskz_permutex2var_epi32 (__mmask16 k, __m512i a, __m512i idx, __m512i b)
#include "immintrin.h"
Instruction: vpermi2d zmm {k}, zmm, zmm
vpermt2d zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
off := idx[i+3:i]*32
IF k[j]
dst[i+31:i] := (idx[i+4]) ? b[off+31:off] : a[off+31:off]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpermi2d, vpermt2d
__m512i _mm512_permutex2var_epi32 (__m512i a, __m512i idx, __m512i b)
Synopsis
__m512i _mm512_permutex2var_epi32 (__m512i a, __m512i idx, __m512i b)
#include "immintrin.h"
Instruction: vpermi2d zmm {k}, zmm, zmm
vpermt2d zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Shuffle 32-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
off := idx[i+3:i]*32
dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off]
ENDFOR
dst[MAX:512] := 0
vpermt2q
__m128i _mm_mask_permutex2var_epi64 (__m128i a, __mmask8 k, __m128i idx, __m128i b)
Synopsis
__m128i _mm_mask_permutex2var_epi64 (__m128i a, __mmask8 k, __m128i idx, __m128i b)
#include "immintrin.h"
Instruction: vpermt2q
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle 64-bit integers in a and b using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
off := idx[i]*64
IF k[j]
dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off]
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpermi2q
__m128i _mm_mask2_permutex2var_epi64 (__m128i a, __m128i idx, __mmask8 k, __m128i b)
Synopsis
__m128i _mm_mask2_permutex2var_epi64 (__m128i a, __m128i idx, __mmask8 k, __m128i b)
#include "immintrin.h"
Instruction: vpermi2q
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle 64-bit integers in a and b using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
off := idx[i]*64
IF k[j]
dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off]
ELSE
dst[i+63:i] := idx[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpermi2q, vpermt2q
__m128i _mm_maskz_permutex2var_epi64 (__mmask8 k, __m128i a, __m128i idx, __m128i b)
Synopsis
__m128i _mm_maskz_permutex2var_epi64 (__mmask8 k, __m128i a, __m128i idx, __m128i b)
#include "immintrin.h"
Instruction: vpermi2q
vpermt2q
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle 64-bit integers in a and b using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
off := idx[i]*64
IF k[j]
dst[i+63:i] := (idx[i+1]) ? b[off+63:off] : a[off+63:off]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpermi2q, vpermt2q
__m128i _mm_permutex2var_epi64 (__m128i a, __m128i idx, __m128i b)
Synopsis
__m128i _mm_permutex2var_epi64 (__m128i a, __m128i idx, __m128i b)
#include "immintrin.h"
Instruction: vpermi2q
vpermt2q
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle 64-bit integers in a and b using the corresponding selector and index in idx, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
off := idx[i]*64
dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off]
ENDFOR
dst[MAX:128] := 0
vpermt2q
__m256i _mm256_mask_permutex2var_epi64 (__m256i a, __mmask8 k, __m256i idx, __m256i b)
Synopsis
__m256i _mm256_mask_permutex2var_epi64 (__m256i a, __mmask8 k, __m256i idx, __m256i b)
#include "immintrin.h"
Instruction: vpermt2q
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
off := idx[i+1:i]*64
IF k[j]
dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off]
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpermi2q
__m256i _mm256_mask2_permutex2var_epi64 (__m256i a, __m256i idx, __mmask8 k, __m256i b)
Synopsis
__m256i _mm256_mask2_permutex2var_epi64 (__m256i a, __m256i idx, __mmask8 k, __m256i b)
#include "immintrin.h"
Instruction: vpermi2q
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
off := idx[i+1:i]*64
IF k[j]
dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off]
ELSE
dst[i+63:i] := idx[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpermi2q, vpermt2q
__m256i _mm256_maskz_permutex2var_epi64 (__mmask8 k, __m256i a, __m256i idx, __m256i b)
Synopsis
__m256i _mm256_maskz_permutex2var_epi64 (__mmask8 k, __m256i a, __m256i idx, __m256i b)
#include "immintrin.h"
Instruction: vpermi2q
vpermt2q
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
off := idx[i+1:i]*64
IF k[j]
dst[i+63:i] := (idx[i+2]) ? b[off+63:off] : a[off+63:off]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpermi2q, vpermt2q
__m256i _mm256_permutex2var_epi64 (__m256i a, __m256i idx, __m256i b)
Synopsis
__m256i _mm256_permutex2var_epi64 (__m256i a, __m256i idx, __m256i b)
#include "immintrin.h"
Instruction: vpermi2q
vpermt2q
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
off := idx[i+1:i]*64
dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off]
ENDFOR
dst[MAX:256] := 0
vpermt2q
__m512i _mm512_mask_permutex2var_epi64 (__m512i a, __mmask8 k, __m512i idx, __m512i b)
Synopsis
__m512i _mm512_mask_permutex2var_epi64 (__m512i a, __mmask8 k, __m512i idx, __m512i b)
#include "immintrin.h"
Instruction: vpermt2q zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
off := idx[i+2:i]*64
IF k[j]
dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off]
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpermi2q
__m512i _mm512_mask2_permutex2var_epi64 (__m512i a, __m512i idx, __mmask8 k, __m512i b)
Synopsis
__m512i _mm512_mask2_permutex2var_epi64 (__m512i a, __m512i idx, __mmask8 k, __m512i b)
#include "immintrin.h"
Instruction: vpermi2q zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
off := idx[i+2:i]*64
IF k[j]
dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off]
ELSE
dst[i+63:i] := idx[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpermi2q, vpermt2q
__m512i _mm512_maskz_permutex2var_epi64 (__mmask8 k, __m512i a, __m512i idx, __m512i b)
Synopsis
__m512i _mm512_maskz_permutex2var_epi64 (__mmask8 k, __m512i a, __m512i idx, __m512i b)
#include "immintrin.h"
Instruction: vpermi2q zmm {k}, zmm, zmm
vpermt2q zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
off := idx[i+2:i]*64
IF k[j]
dst[i+63:i] := (idx[i+3]) ? b[off+63:off] : a[off+63:off]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpermi2q, vpermt2q
__m512i _mm512_permutex2var_epi64 (__m512i a, __m512i idx, __m512i b)
Synopsis
__m512i _mm512_permutex2var_epi64 (__m512i a, __m512i idx, __m512i b)
#include "immintrin.h"
Instruction: vpermi2q zmm {k}, zmm, zmm
vpermt2q zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Shuffle 64-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
off := idx[i+2:i]*64
dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off]
ENDFOR
dst[MAX:512] := 0
vpermt2b
__m128i _mm_mask_permutex2var_epi8 (__m128i a, __mmask16 k, __m128i idx, __m128i b)
Synopsis
__m128i _mm_mask_permutex2var_epi8 (__m128i a, __mmask16 k, __m128i idx, __m128i b)
#include "immintrin.h"
Instruction: vpermt2b
CPUID Flags: AVX512VBMI + AVX512VL
Description
Shuffle 8-bit integers in a and b using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k[j]
off := 8*idx[i+3:i]
dst[i+7:i] := idx[i+4] ? b[off+7:off] : a[off+7:off]
ELSE
dst[i+7:i] := a[i+7:i]
FI
ENDFOR
dst[MAX:128] := 0
vpermi2b
__m128i _mm_mask2_permutex2var_epi8 (__m128i a, __m128i idx, __mmask16 k, __m128i b)
Synopsis
__m128i _mm_mask2_permutex2var_epi8 (__m128i a, __m128i idx, __mmask16 k, __m128i b)
#include "immintrin.h"
Instruction: vpermi2b
CPUID Flags: AVX512VBMI + AVX512VL
Description
Shuffle 8-bit integers in a and b using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k[j]
off := 8*idx[i+3:i]
dst[i+7:i] := idx[i+4] ? b[off+7:off] : a[off+7:off]
ELSE
dst[i+7:i] := idx[i+7:i]
FI
ENDFOR
dst[MAX:128] := 0
vpermi2b, vpermt2b
__m128i _mm_maskz_permutex2var_epi8 (__mmask16 k, __m128i a, __m128i idx, __m128i b)
Synopsis
__m128i _mm_maskz_permutex2var_epi8 (__mmask16 k, __m128i a, __m128i idx, __m128i b)
#include "immintrin.h"
Instruction: vpermi2b
vpermt2b
CPUID Flags: AVX512VBMI + AVX512VL
Description
Shuffle 8-bit integers in a and b using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k[j]
off := 8*idx[i+3:i]
dst[i+7:i] := idx[i+4] ? b[off+7:off] : a[off+7:off]
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpermi2b
__m128i _mm_permutex2var_epi8 (__m128i a, __m128i idx, __m128i b)
Synopsis
__m128i _mm_permutex2var_epi8 (__m128i a, __m128i idx, __m128i b)
#include "immintrin.h"
Instruction: vpermi2b
CPUID Flags: AVX512VBMI + AVX512VL
Description
Shuffle 8-bit integers in a and b using the corresponding selector and index in idx, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*8
off := 8*idx[i+3:i]
dst[i+7:i] := idx[i+4] ? b[off+7:off] : a[off+7:off]
ENDFOR
dst[MAX:128] := 0
vpermt2b
__m256i _mm256_mask_permutex2var_epi8 (__m256i a, __mmask32 k, __m256i idx, __m256i b)
Synopsis
__m256i _mm256_mask_permutex2var_epi8 (__m256i a, __mmask32 k, __m256i idx, __m256i b)
#include "immintrin.h"
Instruction: vpermt2b
CPUID Flags: AVX512VBMI + AVX512VL
Description
Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k[j]
off := 8*idx[i+4:i]
dst[i+7:i] := idx[i+5] ? b[off+7:off] : a[off+7:off]
ELSE
dst[i+7:i] := a[i+7:i]
FI
ENDFOR
dst[MAX:256] := 0
vpermi2b
__m256i _mm256_mask2_permutex2var_epi8 (__m256i a, __m256i idx, __mmask32 k, __m256i b)
Synopsis
__m256i _mm256_mask2_permutex2var_epi8 (__m256i a, __m256i idx, __mmask32 k, __m256i b)
#include "immintrin.h"
Instruction: vpermi2b
CPUID Flags: AVX512VBMI + AVX512VL
Description
Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k[j]
off := 8*idx[i+4:i]
dst[i+7:i] := idx[i+5] ? b[off+7:off] : a[off+7:off]
ELSE
dst[i+7:i] := idx[i+7:i]
FI
ENDFOR
dst[MAX:256] := 0
vpermi2b, vpermt2b
__m256i _mm256_maskz_permutex2var_epi8 (__mmask32 k, __m256i a, __m256i idx, __m256i b)
Synopsis
__m256i _mm256_maskz_permutex2var_epi8 (__mmask32 k, __m256i a, __m256i idx, __m256i b)
#include "immintrin.h"
Instruction: vpermi2b
vpermt2b
CPUID Flags: AVX512VBMI + AVX512VL
Description
Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k[j]
off := 8*idx[i+4:i]
dst[i+7:i] := idx[i+5] ? b[off+7:off] : a[off+7:off]
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpermi2b
__m256i _mm256_permutex2var_epi8 (__m256i a, __m256i idx, __m256i b)
Synopsis
__m256i _mm256_permutex2var_epi8 (__m256i a, __m256i idx, __m256i b)
#include "immintrin.h"
Instruction: vpermi2b
CPUID Flags: AVX512VBMI + AVX512VL
Description
Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.
Operation
FOR j := 0 to 31
i := j*8
off := 8*idx[i+4:i]
dst[i+7:i] := idx[i+5] ? b[off+7:off] : a[off+7:off]
ENDFOR
dst[MAX:256] := 0
vpermt2b
__m512i _mm512_mask_permutex2var_epi8 (__m512i a, __mmask64 k, __m512i idx, __m512i b)
Synopsis
__m512i _mm512_mask_permutex2var_epi8 (__m512i a, __mmask64 k, __m512i idx, __m512i b)
#include "immintrin.h"
Instruction: vpermt2b
CPUID Flags: AVX512VBMI
Description
Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k[j]
off := 8*idx[i+5:i]
dst[i+7:i] := idx[i+6] ? b[off+7:off] : a[off+7:off]
ELSE
dst[i+7:i] := a[i+7:i]
FI
ENDFOR
dst[MAX:512] := 0
vpermi2b
__m512i _mm512_mask2_permutex2var_epi8 (__m512i a, __m512i idx, __mmask64 k, __m512i b)
Synopsis
__m512i _mm512_mask2_permutex2var_epi8 (__m512i a, __m512i idx, __mmask64 k, __m512i b)
#include "immintrin.h"
Instruction: vpermi2b
CPUID Flags: AVX512VBMI
Description
Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k[j]
off := 8*idx[i+5:i]
dst[i+7:i] := idx[i+6] ? b[off+7:off] : a[off+7:off]
ELSE
dst[i+7:i] := idx[i+7:i]
FI
ENDFOR
dst[MAX:512] := 0
vpermi2b, vpermt2b
__m512i _mm512_maskz_permutex2var_epi8 (__mmask64 k, __m512i a, __m512i idx, __m512i b)
Synopsis
__m512i _mm512_maskz_permutex2var_epi8 (__mmask64 k, __m512i a, __m512i idx, __m512i b)
#include "immintrin.h"
Instruction: vpermi2b
vpermt2b
CPUID Flags: AVX512VBMI
Description
Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k[j]
off := 8*idx[i+5:i]
dst[i+7:i] := idx[i+6] ? b[off+7:off] : a[off+7:off]
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpermi2b
__m512i _mm512_permutex2var_epi8 (__m512i a, __m512i idx, __m512i b)
Synopsis
__m512i _mm512_permutex2var_epi8 (__m512i a, __m512i idx, __m512i b)
#include "immintrin.h"
Instruction: vpermi2b
CPUID Flags: AVX512VBMI
Description
Shuffle 8-bit integers in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.
Operation
FOR j := 0 to 63
i := j*8
off := 8*idx[i+5:i]
dst[i+7:i] := idx[i+6] ? b[off+7:off] : a[off+7:off]
ENDFOR
dst[MAX:512] := 0
vpermt2pd
__m128d _mm_mask_permutex2var_pd (__m128d a, __mmask8 k, __m128i idx, __m128d b)
Synopsis
__m128d _mm_mask_permutex2var_pd (__m128d a, __mmask8 k, __m128i idx, __m128d b)
#include "immintrin.h"
Instruction: vpermt2pd
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle double-precision (64-bit) floating-point elements in a and b using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
off := idx[i]*64
IF k[j]
dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off]
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpermi2pd
__m128d _mm_mask2_permutex2var_pd (__m128d a, __m128i idx, __mmask8 k, __m128d b)
Synopsis
__m128d _mm_mask2_permutex2var_pd (__m128d a, __m128i idx, __mmask8 k, __m128d b)
#include "immintrin.h"
Instruction: vpermi2pd
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle double-precision (64-bit) floating-point elements in a and b using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
off := idx[i]*64
IF k[j]
dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off]
ELSE
dst[i+63:i] := idx[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpermi2pd, vpermt2pd
__m128d _mm_maskz_permutex2var_pd (__mmask8 k, __m128d a, __m128i idx, __m128d b)
Synopsis
__m128d _mm_maskz_permutex2var_pd (__mmask8 k, __m128d a, __m128i idx, __m128d b)
#include "immintrin.h"
Instruction: vpermi2pd
vpermt2pd
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle double-precision (64-bit) floating-point elements in a and b using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
off := idx[i]*64
IF k[j]
dst[i+63:i] := (idx[i+1]) ? b[off+63:off] : a[off+63:off]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpermi2pd, vpermt2pd
__m128d _mm_permutex2var_pd (__m128d a, __m128i idx, __m128d b)
Synopsis
__m128d _mm_permutex2var_pd (__m128d a, __m128i idx, __m128d b)
#include "immintrin.h"
Instruction: vpermi2pd
vpermt2pd
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle double-precision (64-bit) floating-point elements in a and b using the corresponding selector and index in idx, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
off := idx[i]*64
dst[i+63:i] := idx[i+1] ? b[off+63:off] : a[off+63:off]
ENDFOR
dst[MAX:128] := 0
vpermt2pd
__m256d _mm256_mask_permutex2var_pd (__m256d a, __mmask8 k, __m256i idx, __m256d b)
Synopsis
__m256d _mm256_mask_permutex2var_pd (__m256d a, __mmask8 k, __m256i idx, __m256d b)
#include "immintrin.h"
Instruction: vpermt2pd
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
off := idx[i+1:i]*64
IF k[j]
dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off]
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpermi2pd
__m256d _mm256_mask2_permutex2var_pd (__m256d a, __m256i idx, __mmask8 k, __m256d b)
Synopsis
__m256d _mm256_mask2_permutex2var_pd (__m256d a, __m256i idx, __mmask8 k, __m256d b)
#include "immintrin.h"
Instruction: vpermi2pd
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
off := idx[i+1:i]*64
IF k[j]
dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off]
ELSE
dst[i+63:i] := idx[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpermi2pd, vpermt2pd
__m256d _mm256_maskz_permutex2var_pd (__mmask8 k, __m256d a, __m256i idx, __m256d b)
Synopsis
__m256d _mm256_maskz_permutex2var_pd (__mmask8 k, __m256d a, __m256i idx, __m256d b)
#include "immintrin.h"
Instruction: vpermi2pd
vpermt2pd
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
off := idx[i+1:i]*64
IF k[j]
dst[i+63:i] := (idx[i+2]) ? b[off+63:off] : a[off+63:off]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpermi2pd, vpermt2pd
__m256d _mm256_permutex2var_pd (__m256d a, __m256i idx, __m256d b)
Synopsis
__m256d _mm256_permutex2var_pd (__m256d a, __m256i idx, __m256d b)
#include "immintrin.h"
Instruction: vpermi2pd
vpermt2pd
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
off := idx[i+1:i]*64
dst[i+63:i] := idx[i+2] ? b[off+63:off] : a[off+63:off]
ENDFOR
dst[MAX:256] := 0
vpermt2pd
__m512d _mm512_mask_permutex2var_pd (__m512d a, __mmask8 k, __m512i idx, __m512d b)
Synopsis
__m512d _mm512_mask_permutex2var_pd (__m512d a, __mmask8 k, __m512i idx, __m512d b)
#include "immintrin.h"
Instruction: vpermt2pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
off := idx[i+2:i]*64
IF k[j]
dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off]
ELSE
dst[i+63:i] := a[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpermi2pd
__m512d _mm512_mask2_permutex2var_pd (__m512d a, __m512i idx, __mmask8 k, __m512d b)
Synopsis
__m512d _mm512_mask2_permutex2var_pd (__m512d a, __m512i idx, __mmask8 k, __m512d b)
#include "immintrin.h"
Instruction: vpermi2pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
off := idx[i+2:i]*64
IF k[j]
dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off]
ELSE
dst[i+63:i] := idx[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpermi2pd, vpermt2pd
__m512d _mm512_maskz_permutex2var_pd (__mmask8 k, __m512d a, __m512i idx, __m512d b)
Synopsis
__m512d _mm512_maskz_permutex2var_pd (__mmask8 k, __m512d a, __m512i idx, __m512d b)
#include "immintrin.h"
Instruction: vpermi2pd zmm {k}, zmm, zmm
vpermt2pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
off := idx[i+2:i]*64
IF k[j]
dst[i+63:i] := (idx[i+3]) ? b[off+63:off] : a[off+63:off]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpermi2pd, vpermt2pd
__m512d _mm512_permutex2var_pd (__m512d a, __m512i idx, __m512d b)
Synopsis
__m512d _mm512_permutex2var_pd (__m512d a, __m512i idx, __m512d b)
#include "immintrin.h"
Instruction: vpermi2pd zmm {k}, zmm, zmm
vpermt2pd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Shuffle double-precision (64-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
off := idx[i+2:i]*64
dst[i+63:i] := idx[i+3] ? b[off+63:off] : a[off+63:off]
ENDFOR
dst[MAX:512] := 0
vpermt2ps
__m128 _mm_mask_permutex2var_ps (__m128 a, __mmask8 k, __m128i idx, __m128 b)
Synopsis
__m128 _mm_mask_permutex2var_ps (__m128 a, __mmask8 k, __m128i idx, __m128 b)
#include "immintrin.h"
Instruction: vpermt2ps
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle single-precision (32-bit) floating-point elements in a and b using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
off := idx[i+1:i]*32
IF k[j]
dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off]
ELSE
dst[i+31:i] := a[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vpermi2ps
__m128 _mm_mask2_permutex2var_ps (__m128 a, __m128i idx, __mmask8 k, __m128 b)
Synopsis
__m128 _mm_mask2_permutex2var_ps (__m128 a, __m128i idx, __mmask8 k, __m128 b)
#include "immintrin.h"
Instruction: vpermi2ps
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle single-precision (32-bit) floating-point elements in a and b using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
off := idx[i+1:i]*32
IF k[j]
dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off]
ELSE
dst[i+31:i] := idx[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vpermi2ps, vpermt2ps
__m128 _mm_maskz_permutex2var_ps (__mmask8 k, __m128 a, __m128i idx, __m128 b)
Synopsis
__m128 _mm_maskz_permutex2var_ps (__mmask8 k, __m128 a, __m128i idx, __m128 b)
#include "immintrin.h"
Instruction: vpermi2ps
vpermt2ps
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle single-precision (32-bit) floating-point elements in a and b using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
off := idx[i+1:i]*32
IF k[j]
dst[i+31:i] := (idx[i+2]) ? b[off+31:off] : a[off+31:off]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpermi2ps, vpermt2ps
__m128 _mm_permutex2var_ps (__m128 a, __m128i idx, __m128 b)
Synopsis
__m128 _mm_permutex2var_ps (__m128 a, __m128i idx, __m128 b)
#include "immintrin.h"
Instruction: vpermi2ps
vpermt2ps
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle single-precision (32-bit) floating-point elements in a and b using the corresponding selector and index in idx, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
off := idx[i+1:i]*32
dst[i+31:i] := idx[i+2] ? b[off+31:off] : a[off+31:off]
ENDFOR
dst[MAX:128] := 0
vpermt2ps
__m256 _mm256_mask_permutex2var_ps (__m256 a, __mmask8 k, __m256i idx, __m256 b)
Synopsis
__m256 _mm256_mask_permutex2var_ps (__m256 a, __mmask8 k, __m256i idx, __m256 b)
#include "immintrin.h"
Instruction: vpermt2ps
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
off := idx[i+2:i]*32
IF k[j]
dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off]
ELSE
dst[i+31:i] := a[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vpermi2ps
__m256 _mm256_mask2_permutex2var_ps (__m256 a, __m256i idx, __mmask8 k, __m256 b)
Synopsis
__m256 _mm256_mask2_permutex2var_ps (__m256 a, __m256i idx, __mmask8 k, __m256 b)
#include "immintrin.h"
Instruction: vpermi2ps
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
off := idx[i+2:i]*32
IF k[j]
dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off]
ELSE
dst[i+31:i] := idx[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vpermi2ps, vpermt2ps
__m256 _mm256_maskz_permutex2var_ps (__mmask8 k, __m256 a, __m256i idx, __m256 b)
Synopsis
__m256 _mm256_maskz_permutex2var_ps (__mmask8 k, __m256 a, __m256i idx, __m256 b)
#include "immintrin.h"
Instruction: vpermi2ps
vpermt2ps
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
off := idx[i+2:i]*32
IF k[j]
dst[i+31:i] := (idx[i+3]) ? b[off+31:off] : a[off+31:off]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpermi2ps, vpermt2ps
__m256 _mm256_permutex2var_ps (__m256 a, __m256i idx, __m256 b)
Synopsis
__m256 _mm256_permutex2var_ps (__m256 a, __m256i idx, __m256 b)
#include "immintrin.h"
Instruction: vpermi2ps
vpermt2ps
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
off := idx[i+2:i]*32
dst[i+31:i] := idx[i+3] ? b[off+31:off] : a[off+31:off]
ENDFOR
dst[MAX:256] := 0
vpermt2ps
__m512 _mm512_mask_permutex2var_ps (__m512 a, __mmask16 k, __m512i idx, __m512 b)
Synopsis
__m512 _mm512_mask_permutex2var_ps (__m512 a, __mmask16 k, __m512i idx, __m512 b)
#include "immintrin.h"
Instruction: vpermt2ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
off := idx[i+3:i]*32
IF k[j]
dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off]
ELSE
dst[i+31:i] := a[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpermi2ps
__m512 _mm512_mask2_permutex2var_ps (__m512 a, __m512i idx, __mmask16 k, __m512 b)
Synopsis
__m512 _mm512_mask2_permutex2var_ps (__m512 a, __m512i idx, __mmask16 k, __m512 b)
#include "immintrin.h"
Instruction: vpermi2ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using writemask k (elements are copied from idx when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
off := idx[i+3:i]*32
IF k[j]
dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off]
ELSE
dst[i+31:i] := idx[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpermi2ps, vpermt2ps
__m512 _mm512_maskz_permutex2var_ps (__mmask16 k, __m512 a, __m512i idx, __m512 b)
Synopsis
__m512 _mm512_maskz_permutex2var_ps (__mmask16 k, __m512 a, __m512i idx, __m512 b)
#include "immintrin.h"
Instruction: vpermi2ps zmm {k}, zmm, zmm
vpermt2ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
off := idx[i+3:i]*32
IF k[j]
dst[i+31:i] := (idx[i+4]) ? b[off+31:off] : a[off+31:off]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpermi2ps, vpermt2ps
__m512 _mm512_permutex2var_ps (__m512 a, __m512i idx, __m512 b)
Synopsis
__m512 _mm512_permutex2var_ps (__m512 a, __m512i idx, __m512 b)
#include "immintrin.h"
Instruction: vpermi2ps zmm {k}, zmm, zmm
vpermt2ps zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Shuffle single-precision (32-bit) floating-point elements in a and b across lanes using the corresponding selector and index in idx, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
off := idx[i+3:i]*32
dst[i+31:i] := idx[i+4] ? b[off+31:off] : a[off+31:off]
ENDFOR
dst[MAX:512] := 0
vpermw
__m128i _mm_mask_permutexvar_epi16 (__m128i src, __mmask8 k, __m128i idx, __m128i a)
Synopsis
__m128i _mm_mask_permutexvar_epi16 (__m128i src, __mmask8 k, __m128i idx, __m128i a)
#include "immintrin.h"
Instruction: vpermw
CPUID Flags: AVX512VL + AVX512BW
Description
Shuffle 16-bit integers in a using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
id := idx[i+2:i]*16
IF k[j]
dst[i+15:i] := a[id+15:id]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:128] := 0
vpermw
__m128i _mm_maskz_permutexvar_epi16 (__mmask8 k, __m128i idx, __m128i a)
Synopsis
__m128i _mm_maskz_permutexvar_epi16 (__mmask8 k, __m128i idx, __m128i a)
#include "immintrin.h"
Instruction: vpermw
CPUID Flags: AVX512VL + AVX512BW
Description
Shuffle 16-bit integers in a using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
id := idx[i+2:i]*16
IF k[j]
dst[i+15:i] := a[id+15:id]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpermw
__m128i _mm_permutexvar_epi16 (__m128i idx, __m128i a)
Synopsis
__m128i _mm_permutexvar_epi16 (__m128i idx, __m128i a)
#include "immintrin.h"
Instruction: vpermw
CPUID Flags: AVX512VL + AVX512BW
Description
Shuffle 16-bit integers in a using the corresponding index in idx, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*16
id := idx[i+2:i]*16
dst[i+15:i] := a[id+15:id]
ENDFOR
dst[MAX:128] := 0
vpermw
__m256i _mm256_mask_permutexvar_epi16 (__m256i src, __mmask16 k, __m256i idx, __m256i a)
Synopsis
__m256i _mm256_mask_permutexvar_epi16 (__m256i src, __mmask16 k, __m256i idx, __m256i a)
#include "immintrin.h"
Instruction: vpermw
CPUID Flags: AVX512VL + AVX512BW
Description
Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
id := idx[i+3:i]*16
IF k[j]
dst[i+15:i] := a[id+15:id]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
vpermw
__m256i _mm256_maskz_permutexvar_epi16 (__mmask16 k, __m256i idx, __m256i a)
Synopsis
__m256i _mm256_maskz_permutexvar_epi16 (__mmask16 k, __m256i idx, __m256i a)
#include "immintrin.h"
Instruction: vpermw
CPUID Flags: AVX512VL + AVX512BW
Description
Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
id := idx[i+3:i]*16
IF k[j]
dst[i+15:i] := a[id+15:id]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpermw
__m256i _mm256_permutexvar_epi16 (__m256i idx, __m256i a)
Synopsis
__m256i _mm256_permutexvar_epi16 (__m256i idx, __m256i a)
#include "immintrin.h"
Instruction: vpermw
CPUID Flags: AVX512VL + AVX512BW
Description
Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*16
id := idx[i+3:i]*16
dst[i+15:i] := a[id+15:id]
ENDFOR
dst[MAX:256] := 0
vpermw
__m512i _mm512_mask_permutexvar_epi16 (__m512i src, __mmask32 k, __m512i idx, __m512i a)
Synopsis
__m512i _mm512_mask_permutexvar_epi16 (__m512i src, __mmask32 k, __m512i idx, __m512i a)
#include "immintrin.h"
Instruction: vpermw
CPUID Flags: AVX512BW
Description
Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
id := idx[i+4:i]*16
IF k[j]
dst[i+15:i] := a[id+15:id]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:512] := 0
vpermw
__m512i _mm512_maskz_permutexvar_epi16 (__mmask32 k, __m512i idx, __m512i a)
Synopsis
__m512i _mm512_maskz_permutexvar_epi16 (__mmask32 k, __m512i idx, __m512i a)
#include "immintrin.h"
Instruction: vpermw
CPUID Flags: AVX512BW
Description
Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
id := idx[i+4:i]*16
IF k[j]
dst[i+15:i] := a[id+15:id]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpermw
__m512i _mm512_permutexvar_epi16 (__m512i idx, __m512i a)
Synopsis
__m512i _mm512_permutexvar_epi16 (__m512i idx, __m512i a)
#include "immintrin.h"
Instruction: vpermw
CPUID Flags: AVX512BW
Description
Shuffle 16-bit integers in a across lanes using the corresponding index in idx, and store the results in dst.
Operation
FOR j := 0 to 31
i := j*16
id := idx[i+4:i]*16
dst[i+15:i] := a[id+15:id]
ENDFOR
dst[MAX:512] := 0
vpermd
__m256i _mm256_mask_permutexvar_epi32 (__m256i src, __mmask8 k, __m256i idx, __m256i a)
Synopsis
__m256i _mm256_mask_permutexvar_epi32 (__m256i src, __mmask8 k, __m256i idx, __m256i a)
#include "immintrin.h"
Instruction: vpermd
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
id := idx[i+2:i]*32
IF k[j]
dst[i+31:i] := a[id+31:id]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vpermd
__m256i _mm256_maskz_permutexvar_epi32 (__mmask8 k, __m256i idx, __m256i a)
Synopsis
__m256i _mm256_maskz_permutexvar_epi32 (__mmask8 k, __m256i idx, __m256i a)
#include "immintrin.h"
Instruction: vpermd
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
id := idx[i+2:i]*32
IF k[j]
dst[i+31:i] := a[id+31:id]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpermd
__m256i _mm256_permutexvar_epi32 (__m256i idx, __m256i a)
Synopsis
__m256i _mm256_permutexvar_epi32 (__m256i idx, __m256i a)
#include "immintrin.h"
Instruction: vpermd
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
id := idx[i+2:i]*32
dst[i+31:i] := a[id+31:id]
ENDFOR
dst[MAX:256] := 0
vpermd
__m512i _mm512_mask_permutexvar_epi32 (__m512i src, __mmask16 k, __m512i idx, __m512i a)
Synopsis
__m512i _mm512_mask_permutexvar_epi32 (__m512i src, __mmask16 k, __m512i idx, __m512i a)
#include "immintrin.h"
Instruction: vpermd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
id := idx[i+3:i]*32
IF k[j]
dst[i+31:i] := a[id+31:id]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpermd
__m512i _mm512_maskz_permutexvar_epi32 (__mmask16 k, __m512i idx, __m512i a)
Synopsis
__m512i _mm512_maskz_permutexvar_epi32 (__mmask16 k, __m512i idx, __m512i a)
#include "immintrin.h"
Instruction: vpermd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
id := idx[i+3:i]*32
IF k[j]
dst[i+31:i] := a[id+31:id]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpermd
__m512i _mm512_permutexvar_epi32 (__m512i idx, __m512i a)
Synopsis
__m512i _mm512_permutexvar_epi32 (__m512i idx, __m512i a)
#include "immintrin.h"
Instruction: vpermd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Shuffle 32-bit integers in a across lanes using the corresponding index in idx, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
id := idx[i+3:i]*32
dst[i+31:i] := a[id+31:id]
ENDFOR
dst[MAX:512] := 0
vpermq
__m256i _mm256_mask_permutexvar_epi64 (__m256i src, __mmask8 k, __m256i idx, __m256i a)
Synopsis
__m256i _mm256_mask_permutexvar_epi64 (__m256i src, __mmask8 k, __m256i idx, __m256i a)
#include "immintrin.h"
Instruction: vpermq
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle 64-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
id := idx[i+1:i]*64
IF k[j]
dst[i+63:i] := a[id+63:id]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpermq
__m256i _mm256_maskz_permutexvar_epi64 (__mmask8 k, __m256i idx, __m256i a)
Synopsis
__m256i _mm256_maskz_permutexvar_epi64 (__mmask8 k, __m256i idx, __m256i a)
#include "immintrin.h"
Instruction: vpermq
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle 64-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
id := idx[i+1:i]*64
IF k[j]
dst[i+63:i] := a[id+63:id]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpermq
__m256i _mm256_permutexvar_epi64 (__m256i idx, __m256i a)
Synopsis
__m256i _mm256_permutexvar_epi64 (__m256i idx, __m256i a)
#include "immintrin.h"
Instruction: vpermq
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle 64-bit integers in a across lanes using the corresponding index in idx, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
id := idx[i+1:i]*64
dst[i+63:i] := a[id+63:id]
ENDFOR
dst[MAX:256] := 0
vpermq
__m512i _mm512_mask_permutexvar_epi64 (__m512i src, __mmask8 k, __m512i idx, __m512i a)
Synopsis
__m512i _mm512_mask_permutexvar_epi64 (__m512i src, __mmask8 k, __m512i idx, __m512i a)
#include "immintrin.h"
Instruction: vpermq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Shuffle 64-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
id := idx[i+2:i]*64
IF k[j]
dst[i+63:i] := a[id+63:id]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpermq
__m512i _mm512_maskz_permutexvar_epi64 (__mmask8 k, __m512i idx, __m512i a)
Synopsis
__m512i _mm512_maskz_permutexvar_epi64 (__mmask8 k, __m512i idx, __m512i a)
#include "immintrin.h"
Instruction: vpermq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Shuffle 64-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
id := idx[i+2:i]*64
IF k[j]
dst[i+63:i] := a[id+63:id]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpermq
__m512i _mm512_permutexvar_epi64 (__m512i idx, __m512i a)
Synopsis
__m512i _mm512_permutexvar_epi64 (__m512i idx, __m512i a)
#include "immintrin.h"
Instruction: vpermq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Shuffle 64-bit integers in a across lanes using the corresponding index in idx, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
id := idx[i+2:i]*64
dst[i+63:i] := a[id+63:id]
ENDFOR
dst[MAX:512] := 0
vpermb
__m128i _mm_mask_permutexvar_epi8 (__m128i src, __mmask16 k, __m128i idx, __m128i a)
Synopsis
__m128i _mm_mask_permutexvar_epi8 (__m128i src, __mmask16 k, __m128i idx, __m128i a)
#include "immintrin.h"
Instruction: vpermb
CPUID Flags: AVX512VBMI + AVX512VL
Description
Shuffle 8-bit integers in a using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
id := idx[i+3:i]*8
IF k[j]
dst[i+7:i] := a[id+7:id]
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:128] := 0
vpermb
__m128i _mm_maskz_permutexvar_epi8 (__mmask16 k, __m128i idx, __m128i a)
Synopsis
__m128i _mm_maskz_permutexvar_epi8 (__mmask16 k, __m128i idx, __m128i a)
#include "immintrin.h"
Instruction: vpermb
CPUID Flags: AVX512VBMI + AVX512VL
Description
Shuffle 8-bit integers in a using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
id := idx[i+3:i]*8
IF k[j]
dst[i+7:i] := a[id+7:id]
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpermb
__m128i _mm_permutexvar_epi8 (__m128i idx, __m128i a)
Synopsis
__m128i _mm_permutexvar_epi8 (__m128i idx, __m128i a)
#include "immintrin.h"
Instruction: vpermb
CPUID Flags: AVX512VBMI + AVX512VL
Description
Shuffle 8-bit integers in a using the corresponding index in idx, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*8
id := idx[i+3:i]*8
dst[i+7:i] := a[id+7:id]
ENDFOR
dst[MAX:128] := 0
vpermb
__m256i _mm256_mask_permutexvar_epi8 (__m256i src, __mmask32 k, __m256i idx, __m256i a)
Synopsis
__m256i _mm256_mask_permutexvar_epi8 (__m256i src, __mmask32 k, __m256i idx, __m256i a)
#include "immintrin.h"
Instruction: vpermb
CPUID Flags: AVX512VBMI + AVX512VL
Description
Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
id := idx[i+4:i]*8
IF k[j]
dst[i+7:i] := a[id+7:id]
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:256] := 0
vpermb
__m256i _mm256_maskz_permutexvar_epi8 (__mmask32 k, __m256i idx, __m256i a)
Synopsis
__m256i _mm256_maskz_permutexvar_epi8 (__mmask32 k, __m256i idx, __m256i a)
#include "immintrin.h"
Instruction: vpermb
CPUID Flags: AVX512VBMI + AVX512VL
Description
Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
id := idx[i+4:i]*8
IF k[j]
dst[i+7:i] := a[id+7:id]
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpermb
__m256i _mm256_permutexvar_epi8 (__m256i idx, __m256i a)
Synopsis
__m256i _mm256_permutexvar_epi8 (__m256i idx, __m256i a)
#include "immintrin.h"
Instruction: vpermb
CPUID Flags: AVX512VBMI + AVX512VL
Description
Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst.
Operation
FOR j := 0 to 31
i := j*8
id := idx[i+4:i]*8
dst[i+7:i] := a[id+7:id]
ENDFOR
dst[MAX:256] := 0
vpermb
__m512i _mm512_mask_permutexvar_epi8 (__m512i src, __mmask64 k, __m512i idx, __m512i a)
Synopsis
__m512i _mm512_mask_permutexvar_epi8 (__m512i src, __mmask64 k, __m512i idx, __m512i a)
#include "immintrin.h"
Instruction: vpermb
CPUID Flags: AVX512VBMI
Description
Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
id := idx[i+5:i]*8
IF k[j]
dst[i+7:i] := a[id+7:id]
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:512] := 0
vpermb
__m512i _mm512_maskz_permutexvar_epi8 (__mmask64 k, __m512i idx, __m512i a)
Synopsis
__m512i _mm512_maskz_permutexvar_epi8 (__mmask64 k, __m512i idx, __m512i a)
#include "immintrin.h"
Instruction: vpermb
CPUID Flags: AVX512VBMI
Description
Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
id := idx[i+5:i]*8
IF k[j]
dst[i+7:i] := a[id+7:id]
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpermb
__m512i _mm512_permutexvar_epi8 (__m512i idx, __m512i a)
Synopsis
__m512i _mm512_permutexvar_epi8 (__m512i idx, __m512i a)
#include "immintrin.h"
Instruction: vpermb
CPUID Flags: AVX512VBMI
Description
Shuffle 8-bit integers in a across lanes using the corresponding index in idx, and store the results in dst.
Operation
FOR j := 0 to 63
i := j*8
id := idx[i+5:i]*8
dst[i+7:i] := a[id+7:id]
ENDFOR
dst[MAX:512] := 0
vpermpd
__m256d _mm256_mask_permutexvar_pd (__m256d src, __mmask8 k, __m256i idx, __m256d a)
Synopsis
__m256d _mm256_mask_permutexvar_pd (__m256d src, __mmask8 k, __m256i idx, __m256d a)
#include "immintrin.h"
Instruction: vpermpd
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle double-precision (64-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
id := idx[i+1:i]*64
IF k[j]
dst[i+63:i] := a[id+63:id]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpermpd
__m256d _mm256_maskz_permutexvar_pd (__mmask8 k, __m256i idx, __m256d a)
Synopsis
__m256d _mm256_maskz_permutexvar_pd (__mmask8 k, __m256i idx, __m256d a)
#include "immintrin.h"
Instruction: vpermpd
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle double-precision (64-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
id := idx[i+1:i]*64
IF k[j]
dst[i+63:i] := a[id+63:id]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpermpd
__m256d _mm256_permutexvar_pd (__m256i idx, __m256d a)
Synopsis
__m256d _mm256_permutexvar_pd (__m256i idx, __m256d a)
#include "immintrin.h"
Instruction: vpermpd
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle double-precision (64-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
id := idx[i+1:i]*64
dst[i+63:i] := a[id+63:id]
ENDFOR
dst[MAX:256] := 0
vpermpd
__m512d _mm512_mask_permutexvar_pd (__m512d src, __mmask8 k, __m512i idx, __m512d a)
Synopsis
__m512d _mm512_mask_permutexvar_pd (__m512d src, __mmask8 k, __m512i idx, __m512d a)
#include "immintrin.h"
Instruction: vpermpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Shuffle double-precision (64-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
id := idx[i+2:i]*64
IF k[j]
dst[i+63:i] := a[id+63:id]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpermpd
__m512d _mm512_maskz_permutexvar_pd (__mmask8 k, __m512i idx, __m512d a)
Synopsis
__m512d _mm512_maskz_permutexvar_pd (__mmask8 k, __m512i idx, __m512d a)
#include "immintrin.h"
Instruction: vpermpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Shuffle double-precision (64-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
id := idx[i+2:i]*64
IF k[j]
dst[i+63:i] := a[id+63:id]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpermpd
__m512d _mm512_permutexvar_pd (__m512i idx, __m512d a)
Synopsis
__m512d _mm512_permutexvar_pd (__m512i idx, __m512d a)
#include "immintrin.h"
Instruction: vpermpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Shuffle double-precision (64-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
id := idx[i+2:i]*64
dst[i+63:i] := a[id+63:id]
ENDFOR
dst[MAX:512] := 0
vpermps
__m256 _mm256_mask_permutexvar_ps (__m256 src, __mmask8 k, __m256i idx, __m256 a)
Synopsis
__m256 _mm256_mask_permutexvar_ps (__m256 src, __mmask8 k, __m256i idx, __m256 a)
#include "immintrin.h"
Instruction: vpermps
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
id := idx[i+2:i]*32
IF k[j]
dst[i+31:i] := a[id+31:id]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vpermps
__m256 _mm256_maskz_permutexvar_ps (__mmask8 k, __m256i idx, __m256 a)
Synopsis
__m256 _mm256_maskz_permutexvar_ps (__mmask8 k, __m256i idx, __m256 a)
#include "immintrin.h"
Instruction: vpermps
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
id := idx[i+2:i]*32
IF k[j]
dst[i+31:i] := a[id+31:id]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpermps
__m256 _mm256_permutexvar_ps (__m256i idx, __m256 a)
Synopsis
__m256 _mm256_permutexvar_ps (__m256i idx, __m256 a)
#include "immintrin.h"
Instruction: vpermps
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
id := idx[i+2:i]*32
dst[i+31:i] := a[id+31:id]
ENDFOR
dst[MAX:256] := 0
vpermps
__m512 _mm512_mask_permutexvar_ps (__m512 src, __mmask16 k, __m512i idx, __m512 a)
Synopsis
__m512 _mm512_mask_permutexvar_ps (__m512 src, __mmask16 k, __m512i idx, __m512 a)
#include "immintrin.h"
Instruction: vpermps zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
id := idx[i+3:i]*32
IF k[j]
dst[i+31:i] := a[id+31:id]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpermps
__m512 _mm512_maskz_permutexvar_ps (__mmask16 k, __m512i idx, __m512 a)
Synopsis
__m512 _mm512_maskz_permutexvar_ps (__mmask16 k, __m512i idx, __m512 a)
#include "immintrin.h"
Instruction: vpermps zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
id := idx[i+3:i]*32
IF k[j]
dst[i+31:i] := a[id+31:id]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpermps
__m512 _mm512_permutexvar_ps (__m512i idx, __m512 a)
Synopsis
__m512 _mm512_permutexvar_ps (__m512i idx, __m512 a)
#include "immintrin.h"
Instruction: vpermps zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Shuffle single-precision (32-bit) floating-point elements in a across lanes using the corresponding index in idx, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
id := idx[i+3:i]*32
dst[i+31:i] := a[id+31:id]
ENDFOR
dst[MAX:512] := 0
pext
unsigned int _pext_u32 (unsigned int a, unsigned int mask)
Synopsis
unsigned int _pext_u32 (unsigned int a, unsigned int mask)
#include "immintrin.h"
Instruction: pext r32, r32, r32
CPUID Flags: BMI2
Description
Extract bits from unsigned 32-bit integer a at the corresponding bit locations specified by mask to contiguous low bits in dst; the remaining upper bits in dst are set to zero.
Operation
tmp := a
dst := 0
m := 0
k := 0
DO WHILE m < 32
IF mask[m] = 1
dst[k] := tmp[m]
k := k + 1
FI
m := m + 1
OD
Performance
pext
unsigned __int64 _pext_u64 (unsigned __int64 a, unsigned __int64 mask)
Synopsis
unsigned __int64 _pext_u64 (unsigned __int64 a, unsigned __int64 mask)
#include "immintrin.h"
Instruction: pext r64, r64, r64
CPUID Flags: BMI2
Description
Extract bits from unsigned 64-bit integer a at the corresponding bit locations specified by mask to contiguous low bits in dst; the remaining upper bits in dst are set to zero.
Operation
tmp := a
dst := 0
m := 0
k := 0
DO WHILE m < 64
IF mask[m] = 1
dst[k] := tmp[m]
k := k + 1
FI
m := m + 1
OD
Performance
pextrw
int _m_pextrw (__m64 a, int imm8)
Synopsis
int _m_pextrw (__m64 a, int imm8)
#include "xmmintrin.h"
Instruction: pextrw r32, mm, imm
CPUID Flags: SSE
Description
Extract a 16-bit integer from a, selected with imm8, and store the result in the lower element of dst.
Operation
dst[15:0] := (a[63:0] >> (imm8[1:0] * 16))[15:0]
dst[31:16] := 0
pinsrw
__m64 _m_pinsrw (__m64 a, int i, int imm8)
Synopsis
__m64 _m_pinsrw (__m64 a, int i, int imm8)
#include "xmmintrin.h"
Instruction: pinsrw mm, r32, imm
CPUID Flags: SSE
Description
Copy a to dst, and insert the 16-bit integer i into dst at the location specified by imm8.
Operation
dst[63:0] := a[63:0]
sel := imm8[1:0]*16
dst[sel+15:sel] := i[15:0]
pmaxsw
__m64 _m_pmaxsw (__m64 a, __m64 b)
Synopsis
__m64 _m_pmaxsw (__m64 a, __m64 b)
#include "xmmintrin.h"
Instruction: pmaxsw mm, mm
CPUID Flags: SSE
Description
Compare packed 16-bit integers in a and b, and store packed maximum values in dst.
Operation
FOR j := 0 to 3
i := j*16
IF a[i+15:i] > b[i+15:i]
dst[i+15:i] := a[i+15:i]
ELSE
dst[i+15:i] := b[i+15:i]
FI
ENDFOR
Performance
pmaxub
__m64 _m_pmaxub (__m64 a, __m64 b)
Synopsis
__m64 _m_pmaxub (__m64 a, __m64 b)
#include "xmmintrin.h"
Instruction: pmaxub mm, mm
CPUID Flags: SSE
Description
Compare packed unsigned 8-bit integers in a and b, and store packed maximum values in dst.
Operation
FOR j := 0 to 7
i := j*8
IF a[i+7:i] > b[i+7:i]
dst[i+7:i] := a[i+7:i]
ELSE
dst[i+7:i] := b[i+7:i]
FI
ENDFOR
Performance
pminsw
__m64 _m_pminsw (__m64 a, __m64 b)
Synopsis
__m64 _m_pminsw (__m64 a, __m64 b)
#include "xmmintrin.h"
Instruction: pminsw mm, mm
CPUID Flags: SSE
Description
Compare packed 16-bit integers in a and b, and store packed minimum values in dst.
Operation
FOR j := 0 to 3
i := j*16
IF a[i+15:i] < b[i+15:i]
dst[i+15:i] := a[i+15:i]
ELSE
dst[i+15:i] := b[i+15:i]
FI
ENDFOR
Performance
pminub
__m64 _m_pminub (__m64 a, __m64 b)
Synopsis
__m64 _m_pminub (__m64 a, __m64 b)
#include "xmmintrin.h"
Instruction: pminub mm, mm
CPUID Flags: SSE
Description
Compare packed unsigned 8-bit integers in a and b, and store packed minimum values in dst.
Operation
FOR j := 0 to 7
i := j*8
IF a[i+7:i] < b[i+7:i]
dst[i+7:i] := a[i+7:i]
ELSE
dst[i+7:i] := b[i+7:i]
FI
ENDFOR
Performance
pmovmskb
int _m_pmovmskb (__m64 a)
Synopsis
int _m_pmovmskb (__m64 a)
#include "xmmintrin.h"
Instruction: pmovmskb r32, mm
CPUID Flags: SSE
Description
Create mask from the most significant bit of each 8-bit element in a, and store the result in dst.
Operation
FOR j := 0 to 7
i := j*8
dst[j] := a[i+7]
ENDFOR
dst[MAX:8] := 0
pmulhuw
__m64 _m_pmulhuw (__m64 a, __m64 b)
Synopsis
__m64 _m_pmulhuw (__m64 a, __m64 b)
#include "xmmintrin.h"
Instruction: pmulhuw mm, mm
CPUID Flags: SSE
Description
Multiply the packed unsigned 16-bit integers in a and b, producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in dst.
Operation
FOR j := 0 to 3
i := j*16
tmp[31:0] := a[i+15:i] * b[i+15:i]
dst[i+15:i] := tmp[31:16]
ENDFOR
popcnt
int _mm_popcnt_u32 (unsigned int a)
Synopsis
int _mm_popcnt_u32 (unsigned int a)
#include "nmmintrin.h"
Instruction: popcnt r32, r32
CPUID Flags: POPCNT
Description
Count the number of bits set to 1 in unsigned 32-bit integer a, and return that count in dst.
Operation
dst := 0
FOR i := 0 to 31
IF a[i]
dst := dst + 1
FI
ENDFOR
Performance
popcnt
__int64 _mm_popcnt_u64 (unsigned __int64 a)
Synopsis
__int64 _mm_popcnt_u64 (unsigned __int64 a)
#include "nmmintrin.h"
Instruction: popcnt r64, r64
CPUID Flags: POPCNT
Description
Count the number of bits set to 1 in unsigned 64-bit integer a, and return that count in dst.
Operation
dst := 0
FOR i := 0 to 63
IF a[i]
dst := dst + 1
FI
ENDFOR
Performance
popcnt
int _popcnt32 (int a)
Synopsis
int _popcnt32 (int a)
#include "immintrin.h"
Instruction: popcnt r32, r32
CPUID Flags: POPCNT
Description
Count the number of bits set to 1 in 32-bit integer a, and return that count in dst.
Operation
dst := 0
FOR i := 0 to 31
IF a[i]
dst := dst + 1
FI
ENDFOR
Performance
popcnt
int _popcnt64 (__int64 a)
Synopsis
int _popcnt64 (__int64 a)
#include "immintrin.h"
Instruction: popcnt r64, r64
CPUID Flags: POPCNT
Description
Count the number of bits set to 1 in 64-bit integer a, and return that count in dst.
Operation
dst := 0
FOR i := 0 to 63
IF a[i]
dst := dst + 1
FI
ENDFOR
Performance
...
__m128d _mm_pow_pd (__m128d a, __m128d b)
Synopsis
__m128d _mm_pow_pd (__m128d a, __m128d b)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the exponential value of packed double-precision (64-bit) floating-point elements in a raised by packed elements in b, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := (a[i+63:i])^(b[i+63:i])
ENDFOR
dst[MAX:128] := 0
...
__m256d _mm256_pow_pd (__m256d a, __m256d b)
Synopsis
__m256d _mm256_pow_pd (__m256d a, __m256d b)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the exponential value of packed double-precision (64-bit) floating-point elements in a raised by packed elements in b, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := (a[i+63:i])^(b[i+63:i])
ENDFOR
dst[MAX:256] := 0
...
__m512d _mm512_mask_pow_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
Synopsis
__m512d _mm512_mask_pow_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the exponential value of packed double-precision (64-bit) floating-point elements in a raised by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := (a[i+63:i])^(b[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m512d _mm512_pow_pd (__m512d a, __m512d b)
Synopsis
__m512d _mm512_pow_pd (__m512d a, __m512d b)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the exponential value of packed double-precision (64-bit) floating-point elements in a raised by packed elements in b, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := (a[i+63:i])^(b[i+63:i])
ENDFOR
dst[MAX:512] := 0
...
__m128 _mm_pow_ps (__m128 a, __m128 b)
Synopsis
__m128 _mm_pow_ps (__m128 a, __m128 b)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the exponential value of packed single-precision (32-bit) floating-point elements in a raised by packed elements in b, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := (a[i+31:i])^(b[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256 _mm256_pow_ps (__m256 a, __m256 b)
Synopsis
__m256 _mm256_pow_ps (__m256 a, __m256 b)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the exponential value of packed single-precision (32-bit) floating-point elements in a raised by packed elements in b, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := (a[i+31:i])^(b[i+31:i])
ENDFOR
dst[MAX:256] := 0
...
__m512 _mm512_mask_pow_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
Synopsis
__m512 _mm512_mask_pow_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the exponential value of packed single-precision (32-bit) floating-point elements in a raised by packed elements in b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := (a[i+31:i])^(b[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m512 _mm512_pow_ps (__m512 a, __m512 b)
Synopsis
__m512 _mm512_pow_ps (__m512 a, __m512 b)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the exponential value of packed single-precision (32-bit) floating-point elements in a raised by packed elements in b, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := (a[i+31:i])^(b[i+31:i])
ENDFOR
dst[MAX:512] := 0
prefetchnta, prefetcht0, prefetcht1, prefetcht2
void _mm_prefetch (char const* p, int i)
Synopsis
void _mm_prefetch (char const* p, int i)
#include "xmmintrin.h"
Instruction: prefetchnta mprefetch
prefetcht0 mprefetch
prefetcht1 mprefetch
prefetcht2 mprefetch
CPUID Flags: SSE
Description
Fetch the line of data from memory that contains address p to a location in the cache hierarchy specified by the locality hint i.
prefetchwt1
void _mm_prefetch (char const* p, int i)
Synopsis
void _mm_prefetch (char const* p, int i)
#include "xmmintrin.h"
Instruction: prefetchwt1 mprefetch
CPUID Flags: PREFETCHWT1
Description
Fetch the line of data from memory that contains address p to a location in the cache hierarchy specified by the locality hint i.
vprefetch0, vprefetch1, vprefetch2, vprefetchnta, vprefetche0, vprefetche1, vprefetche2, vprefetchenta
void _mm_prefetch (char const* p, int i)
Synopsis
void _mm_prefetch (char const* p, int i)
#include "xmmintrin.h"
Instruction: vprefetch0 mprefetch
vprefetch1 mprefetch
vprefetch2 mprefetch
vprefetchnta mprefetch
vprefetche0 mprefetch
vprefetche1 mprefetch
vprefetche2 mprefetch
vprefetchenta mprefetch
CPUID Flags: KNCNI
Description
Fetch the line of data from memory that contains address p to a location in the cache hierarchy specified by the locality hint i.
vgatherpf0dps, vgatherpf1dps
void _mm512_mask_prefetch_i32extgather_ps (__m512i index, __mmask16 k, void const * mv, _MM_UPCONV_PS_ENUM conv, int scale, int hint)
Synopsis
void _mm512_mask_prefetch_i32extgather_ps (__m512i index, __mmask16 k, void const * mv, _MM_UPCONV_PS_ENUM conv, int scale, int hint)
#include "immintrin.h"
Instruction: vgatherpf0dps m512 {k}
vgatherpf1dps m512 {k}
CPUID Flags: AVX512PF for AVX-512, KNCNI for KNC
Description
Prefetches a set of 16 single-precision (32-bit) memory locations pointed by base address mv and 32-bit integer index vector index with scale scale to L1 or L2 level of cache depending on the value of hint. Gathered elements are merged in cache using writemask k (elements are brought into cache only when their corresponding mask bits are set). The hint parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache.
The conv parameter specifies the granularity used by compilers to better encode the instruction. It should be the same as the conv parameter specified for the subsequent gather intrinsic.
Operation
FOR j := 0 to 15
addr := MEM[mv + index[j] * scale]
i := j*32
IF k[j] THEN
CASE hint OF
_MM_HINT_T0: PrefetchL1WithT0Hint(addr[i+31:i])
_MM_HINT_T1: PrefetchL2WithT1Hint(addr[i+31:i])
ESAC
FI
ENDFOR
dst[MAX:512] := 0
vgatherpf0dps, vgatherpf1dps
void _mm512_prefetch_i32extgather_ps (__m512i index, void const * mv, _MM_UPCONV_PS_ENUM conv, int scale, int hint)
Synopsis
void _mm512_prefetch_i32extgather_ps (__m512i index, void const * mv, _MM_UPCONV_PS_ENUM conv, int scale, int hint)
#include "immintrin.h"
Instruction: vgatherpf0dps m512 {k}
vgatherpf1dps m512 {k}
CPUID Flags: AVX512PF for AVX-512, KNCNI for KNC
Description
Prefetches a set of 16 single-precision (32-bit) memory locations pointed by base address mv and 32-bit integer index vector index with scale scale to L1 or L2 level of cache depending on the value of hint. The hint parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache.
The conv parameter specifies the granularity used by compilers to better encode the instruction. It should be the same as the conv parameter specified for the subsequent gather intrinsic.
Operation
FOR j := 0 to 15
addr := MEM[mv + index[j] * scale]
i := j*32
CASE hint OF
_MM_HINT_T0: PrefetchL1WithT0Hint(addr[i+31:i])
_MM_HINT_T1: PrefetchL2WithT1Hint(addr[i+31:i])
ESAC
ENDFOR
dst[MAX:512] := 0
vscatterpf0dps, vscatterpf1dps
void _mm512_mask_prefetch_i32extscatter_ps (void * mv, __mmask16 k, __m512i index, _MM_UPCONV_PS_ENUM conv, int scale, int hint)
Synopsis
void _mm512_mask_prefetch_i32extscatter_ps (void * mv, __mmask16 k, __m512i index, _MM_UPCONV_PS_ENUM conv, int scale, int hint)
#include "immintrin.h"
Instruction: vscatterpf0dps m512 {k}
vscatterpf1dps m512 {k}
CPUID Flags: AVX512PF for AVX-512, KNCNI for KNC
Description
Prefetches a set of 16 single-precision (32-bit) memory locations pointed by base address mv and 32-bit integer index vector index with scale scale to L1 or L2 level of cache depending on the value of hint. The hint parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache.
The conv parameter specifies the granularity used by compilers to better encode the instruction. It should be the same as the conv parameter specified for the subsequent gather intrinsic. Only those elements whose corresponding mask bit in k is set are loaded into cache.
Operation
cachev := 0
FOR j := 0 to 15
i := j*32
IF k[j]
addr := MEM[mv + index[j] * scale]
CASE hint OF
_MM_HINT_T0: PrefetchL1WithT0Hint(addr[i+31:i])
_MM_HINT_T1: PrefetchL2WithT1Hint(addr[i+31:i])
ESAC
FI
ENDFOR
vscatterpf0dps, vscatterpf1dps
void _mm512_prefetch_i32extscatter_ps (void * mv, __m512i index, _MM_UPCONV_PS_ENUM conv, int scale, int hint)
Synopsis
void _mm512_prefetch_i32extscatter_ps (void * mv, __m512i index, _MM_UPCONV_PS_ENUM conv, int scale, int hint)
#include "immintrin.h"
Instruction: vscatterpf0dps m512 {k}
vscatterpf1dps m512 {k}
CPUID Flags: AVX512PF for AVX-512, KNCNI for KNC
Description
Prefetches a set of 16 single-precision (32-bit) memory locations pointed by base address mv and 32-bit integer index vector index with scale scale to L1 or L2 level of cache depending on the value of hint, with a request for exclusive ownership. The hint parameter may be one of the following: _MM_HINT_T0 = 1 for prefetching to L1 cache, _MM_HINT_T1 = 2 for prefetching to L2 cache, _MM_HINT_T2 = 3 for prefetching to L2 cache non-temporal, _MM_HINT_NTA = 0 for prefetching to L1 cache non-temporal. The conv parameter specifies the granularity used by compilers to better encode the instruction. It should be the same as the conv parameter specified for the subsequent scatter intrinsic.
Operation
cachev := 0
FOR j := 0 to 15
i := j*32
addr := MEM[mv + index[j] * scale]
CASE hint OF
_MM_HINT_T0: PrefetchL1WithT0Hint(addr[i+31:i])
_MM_HINT_T1: PrefetchL2WithT1Hint(addr[i+31:i])
_MM_HINT_T2: PrefetchL2WithT1HintNonTemporal(addr[i+31:i])
_MM_HINT_NTA: PrefetchL1WithT0HintNonTemporal(addr[i+31:i])
ESAC
ENDFOR
vgatherpf0dpd, vgatherpf1dpd
void _mm512_mask_prefetch_i32gather_pd (__m256i vindex, __mmask8 mask, void const* base_addr, int scale, int hint)
Synopsis
void _mm512_mask_prefetch_i32gather_pd (__m256i vindex, __mmask8 mask, void const* base_addr, int scale, int hint)
#include "immintrin.h"
Instruction: vgatherpf0dpd vm32y {k}
vgatherpf1dpd vm32y {k}
CPUID Flags: AVX512PF
Description
Prefetch double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged in cache using writemask k (elements are brought into cache only when their corresponding mask bits are set). scale should be 1, 2, 4 or 8. The hint parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache.
Operation
FOR j := 0 to 7
i := j*32;
IF mask[j] THEN
Prefetch([base_addr + SignExtend(vindex[i+31:i]) * scale], hint, RFO=0);
FI
ENDFOR;
vgatherpf0dpd, vgatherpf1dpd
void _mm512_prefetch_i32gather_pd (__m256i vindex, void const* base_addr, int scale, int hint)
Synopsis
void _mm512_prefetch_i32gather_pd (__m256i vindex, void const* base_addr, int scale, int hint)
#include "immintrin.h"
Instruction: vgatherpf0dpd vm32y {k}
vgatherpf1dpd vm32y {k}
CPUID Flags: AVX512PF
Description
Prefetch double-precision (64-bit) floating-point elements from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged in cache. scale should be 1, 2, 4 or 8. The hint parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache.
Operation
FOR j := 0 to 7
i := j*32;
Prefetch([base_addr + SignExtend(vindex[i+31:i]) * scale], hint, RFO=0);
ENDFOR;
vgatherpf0dps, vgatherpf1dps
void _mm512_mask_prefetch_i32gather_ps (__m512i vindex, __mmask16 mask, void const* base_addr, int scale, int hint)
Synopsis
void _mm512_mask_prefetch_i32gather_ps (__m512i vindex, __mmask16 mask, void const* base_addr, int scale, int hint)
#include "immintrin.h"
Instruction: vgatherpf0dps vm32y {k}
vgatherpf1dps vm32y {k}
CPUID Flags: AVX512PF for AVX-512, KNCNI for KNC
Description
Prefetch single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged in cache using writemask k (elements are brought into cache only when their corresponding mask bits are set). scale should be 1, 2, 4 or 8. The hint parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache.
Operation
FOR j := 0 to 15
i := j*32;
IF mask[j] THEN
Prefetch([base_addr + SignExtend(vindex[i+31:i]) * scale], hint, RFO=0);
FI
ENDFOR;
vgatherpf0dps, vgatherpf1dps
void _mm512_prefetch_i32gather_ps (__m512i index, void const* mv, int scale, int hint)
Synopsis
void _mm512_prefetch_i32gather_ps (__m512i index, void const* mv, int scale, int hint)
#include "immintrin.h"
Instruction: vgatherpf0dps m512 {k}
vgatherpf1dps m512 {k}
CPUID Flags: AVX512PF for AVX-512, KNCNI for KNC
Description
Prefetches 16 single-precision (32-bit) floating-point elements in memory starting at location mv at packed 32-bit integer indices stored in index scaled by scale. The hint parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache.
Operation
cachev := 0
FOR j := 0 to 15
i := j*32
addr := MEM[mv + index[j] * scale]
cachev[i+31:i] := addr[i+31:i]
ENDFOR
vscatterpf0dpd, vscatterpf1dpd
void _mm512_mask_prefetch_i32scatter_pd (void* base_addr, __mmask8 mask, __m256i vindex, int scale, int hint)
Synopsis
void _mm512_mask_prefetch_i32scatter_pd (void* base_addr, __mmask8 mask, __m256i vindex, int scale, int hint)
#include "immintrin.h"
Instruction: vscatterpf0dpd vm32y {k}
vscatterpf1dpd vm32y {k}
CPUID Flags: AVX512PF
Description
Prefetch double-precision (64-bit) floating-point elements with intent to write using 32-bit indices. The hint parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. 64-bit elements are brought into cache from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not brought into cache when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 TO 7
i := j*32;
IF mask[j] THEN
Prefetch([base_addr + SignExtend(vindex[i+31:i]) * scale], Level=hint, RFO=1);
FI
ENDFOR;
vscatterpf0dpd, vscatterpf1dpd
void _mm512_prefetch_i32scatter_pd (void* base_addr, __m256i vindex, int scale, int hint)
Synopsis
void _mm512_prefetch_i32scatter_pd (void* base_addr, __m256i vindex, int scale, int hint)
#include "immintrin.h"
Instruction: vscatterpf0dpd vm32y {k}
vscatterpf1dpd vm32y {k}
CPUID Flags: AVX512PF
Description
Prefetch double-precision (64-bit) floating-point elements with intent to write using 32-bit indices. The hint parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. 64-bit elements are brought into cache from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 TO 7
i := j*32;
Prefetch([base_addr + SignExtend(vindex[i+31:i]) * scale], Level=hint, RFO=1);
ENDFOR;
vscatterpf0dps, vscatterpf1dps
void _mm512_mask_prefetch_i32scatter_ps (void* mv, __mmask16 k, __m512i index, int scale, int hint)
Synopsis
void _mm512_mask_prefetch_i32scatter_ps (void* mv, __mmask16 k, __m512i index, int scale, int hint)
#include "immintrin.h"
Instruction: vscatterpf0dps m512 {k}
vscatterpf1dps m512 {k}
CPUID Flags: AVX512PF for AVX-512, KNCNI for KNC
Description
Prefetches 16 single-precision (32-bit) floating-point elements in memory starting at location mv at packed 32-bit integer indices stored in index scaled by scale. The hint parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. Only those elements whose corresponding mask bit in k is set are loaded into the desired cache.
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
addr := MEM[mv + index[j] * scale]
CASE hint OF
_MM_HINT_T0: PrefetchL1WithT0Hint(addr[i+31:i])
_MM_HINT_T1: PrefetchL2WithT1Hint(addr[i+31:i])
_MM_HINT_T2: PrefetchL2WithT1HintNonTemporal(addr[i+31:i])
_MM_HINT_NTA: PrefetchL1WithT0HintNonTemporal(addr[i+31:i])
ESAC
FI
ENDFOR
vscatterpf0dps, vscatterpf1dps
void _mm512_prefetch_i32scatter_ps (void* mv, __m512i index, int scale, int hint)
Synopsis
void _mm512_prefetch_i32scatter_ps (void* mv, __m512i index, int scale, int hint)
#include "immintrin.h"
Instruction: vscatterpf0dps m512 {k}
vscatterpf1dps m512 {k}
CPUID Flags: AVX512PF for AVX-512, KNCNI for KNC
Description
Prefetches 16 single-precision (32-bit) floating-point elements in memory starting at location mv at packed 32-bit integer indices stored in index scaled by scale. The hint parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache.
Operation
FOR j := 0 to 15
i := j*32
addr := MEM[mv + index[j] * scale]
CASE hint OF
_MM_HINT_T0: PrefetchL1WithT0Hint(addr[i+31:i])
_MM_HINT_T1: PrefetchL2WithT1Hint(addr[i+31:i])
_MM_HINT_T2: PrefetchL2WithT1HintNonTemporal(addr[i+31:i])
_MM_HINT_NTA: PrefetchL1WithT0HintNonTemporal(addr[i+31:i])
ESAC
ENDFOR
vgatherpf0qpd, vgatherpf1qpd
void _mm512_mask_prefetch_i64gather_pd (__m512i vindex, __mmask8 mask, void const* base_addr, int scale, int hint)
Synopsis
void _mm512_mask_prefetch_i64gather_pd (__m512i vindex, __mmask8 mask, void const* base_addr, int scale, int hint)
#include "immintrin.h"
Instruction: vgatherpf0qpd vm32z {k}
vgatherpf1qpd vm32z {k}
CPUID Flags: AVX512PF
Description
Prefetch double-precision (64-bit) floating-point elements from memory into cache level specified by hint using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Prefetched elements are merged in cache using writemask k (elements are copied from memory when the corresponding mask bit is set). scale should be 1, 2, 4 or 8. The hint parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache.
Operation
FOR j := 0 to 7
i := j*64;
IF mask[j] THEN
Prefetch([base_addr + SignExtend(vindex[i+63:i]) * scale], Level=hint, RFO=0);
FI
ENDFOR;
vgatherpf0qpd, vgatherpf1qpd
void _mm512_prefetch_i64gather_pd (__m512i vindex, void const* base_addr, int scale, int hint)
Synopsis
void _mm512_prefetch_i64gather_pd (__m512i vindex, void const* base_addr, int scale, int hint)
#include "immintrin.h"
Instruction: vgatherpf0qpd vm32z {k}
vgatherpf1qpd vm32z {k}
CPUID Flags: AVX512PF
Description
Prefetch double-precision (64-bit) floating-point elements from memory into cache level specified by hint using 64-bit indices. 64-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8. The hint parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache.
Operation
FOR j := 0 to 7
i := j*64;
Prefetch([base_addr + SignExtend(vindex[i+63:i]) * scale], Level=hint, RFO=0);
ENDFOR;
vgatherpf0qps, vgatherpf1qps
void _mm512_mask_prefetch_i64gather_ps (__m512i vindex, __mmask8 mask, void const* base_addr, int scale, int hint)
Synopsis
void _mm512_mask_prefetch_i64gather_ps (__m512i vindex, __mmask8 mask, void const* base_addr, int scale, int hint)
#include "immintrin.h"
Instruction: vgatherpf0qps vm64z {k}
vgatherpf1qps vm64z {k}
CPUID Flags: AVX512PF
Description
Prefetch single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged in cache using writemask k (elements are only brought into cache when their corresponding mask bit is set). scale should be 1, 2, 4 or 8. The hint parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache.
Operation
FOR j:= 0 to 7
i := j*64;
IF mask[j] THEN
Prefetch([base_addr + SignExtend(vindex[i+63:i]) * scale], hint, RFO=0);
FI
ENDFOR;
vgatherpf0qps, vgatherpf1qps
void _mm512_prefetch_i64gather_ps (__m512i vindex, void const* base_addr, int scale, int hint)
Synopsis
void _mm512_prefetch_i64gather_ps (__m512i vindex, void const* base_addr, int scale, int hint)
#include "immintrin.h"
Instruction: vgatherpf0qps vm64z {k}
vgatherpf1qps vm64z {k}
CPUID Flags: AVX512PF
Description
Prefetch single-precision (32-bit) floating-point elements from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged in cache. scale should be 1, 2, 4 or 8. The hint parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache.
Operation
FOR j:= 0 to 7
i := j*64;
Prefetch([base_addr + SignExtend(vindex[i+63:i]) * scale], hint, RFO=0);
ENDFOR;
vscatterpf0qpd, vscatterpf1qpd
void _mm512_mask_prefetch_i64scatter_pd (void* base_addr, __mmask8 mask, __m512i vindex, int scale, int hint)
Synopsis
void _mm512_mask_prefetch_i64scatter_pd (void* base_addr, __mmask8 mask, __m512i vindex, int scale, int hint)
#include "immintrin.h"
Instruction: vscatterpf0qpd vm32z {k}
vscatterpf1qpd vm32z {k}
CPUID Flags: AVX512PF
Description
Prefetch double-precision (64-bit) floating-point elements with intent to write into memory using 64-bit indices. The hint parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. 64-bit elements are brought into cache from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not brought into cache when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 7
i := j*64;
IF mask[j] THEN
Prefetch([base_addr + SignExtend(vindex[i+63:i]) * scale], Level=hint, RFO=1);
FI
ENDFOR;
vscatterpf0qpd, vscatterpf1qpd
void _mm512_prefetch_i64scatter_pd (void* base_addr, __m512i vindex, int scale, int hint)
Synopsis
void _mm512_prefetch_i64scatter_pd (void* base_addr, __m512i vindex, int scale, int hint)
#include "immintrin.h"
Instruction: vscatterpf0qpd vm32z {k}
vscatterpf1qpd vm32z {k}
CPUID Flags: AVX512PF
Description
Prefetch double-precision (64-bit) floating-point elements with intent to write into memory using 64-bit indices. The hint parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. 64-bit elements are brought into cache from addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 7
i := j*64;
Prefetch([base_addr + SignExtend(vindex[i+63:i]) * scale], Level=hint, RFO=1);
ENDFOR;
vscatterpf0qps, vscatterpf1qps
void _mm512_mask_prefetch_i64scatter_ps (void* base_addr, __mmask8 mask, __m512i vindex, int scale, int hint)
Synopsis
void _mm512_mask_prefetch_i64scatter_ps (void* base_addr, __mmask8 mask, __m512i vindex, int scale, int hint)
#include "immintrin.h"
Instruction: vscatterpf0qps vm64z {k}
vscatterpf1qps vm64z {k}
CPUID Flags: AVX512PF
Description
Prefetch single-precision (32-bit) floating-point elements with intent to write into memory using 64-bit indices. The hint parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. 32-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale) subject to mask k (elements are not brought into cache when the corresponding mask bit is not set). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 7
i := j*64;
IF mask[j] THEN
Prefetch([base_addr + SignExtend(vindex[i+63:i]) * scale], Level=hint, RFO=1);
FI
ENDFOR;
vscatterpf0qps, vscatterpf1qps
void _mm512_prefetch_i64scatter_ps (void* base_addr, __m512i vindex, int scale, int hint)
Synopsis
void _mm512_prefetch_i64scatter_ps (void* base_addr, __m512i vindex, int scale, int hint)
#include "immintrin.h"
Instruction: vscatterpf0qps vm64z {k}
vscatterpf1qps vm64z {k}
CPUID Flags: AVX512PF
Description
Prefetch single-precision (32-bit) floating-point elements with intent to write into memory using 64-bit indices. The hint parameter may be 1 (_MM_HINT_T0) for prefetching to L1 cache, or 2 (_MM_HINT_T1) for prefetching to L2 cache. 32-bit elements are stored at addresses starting at base_addr and offset by each 64-bit element in vindex (each index is scaled by the factor in scale). scale should be 1, 2, 4 or 8.
Operation
FOR j := 0 to 7
i := j*64;
Prefetch([base_addr + SignExtend(vindex[i+63:i]) * scale], Level=hint, RFO=1);
ENDFOR;
psadbw
__m64 _m_psadbw (__m64 a, __m64 b)
Synopsis
__m64 _m_psadbw (__m64 a, __m64 b)
#include "xmmintrin.h"
Instruction: psadbw mm, mm
CPUID Flags: SSE
Description
Compute the absolute differences of packed unsigned 8-bit integers in a and b, then horizontally sum each consecutive 8 differences to produce four unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low 16 bits of dst.
Operation
FOR j := 0 to 7
i := j*8
tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i])
ENDFOR
dst[15:0] := tmp[7:0] + tmp[15:8] + tmp[23:16] + tmp[31:24] + tmp[39:32] + tmp[47:40] + tmp[55:48] + tmp[63:56]
dst[63:16] := 0
pshufw
__m64 _m_pshufw (__m64 a, int imm8)
Synopsis
__m64 _m_pshufw (__m64 a, int imm8)
#include "xmmintrin.h"
Instruction: pshufw mm, mm, imm
CPUID Flags: SSE
Description
Shuffle 16-bit integers in a using the control in imm8, and store the results in dst.
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[15:0] := src[15:0]
1: tmp[15:0] := src[31:16]
2: tmp[15:0] := src[47:32]
3: tmp[15:0] := src[63:48]
ESAC
RETURN tmp[15:0]
}
dst[15:0] := SELECT4(a[63:0], imm8[1:0])
dst[31:16] := SELECT4(a[63:0], imm8[3:2])
dst[47:32] := SELECT4(a[63:0], imm8[5:4])
dst[63:48] := SELECT4(a[63:0], imm8[7:6])
vrangepd
__m128d _mm_mask_range_pd (__m128d src, __mmask8 k, __m128d a, __m128d b, int imm8)
Synopsis
__m128d _mm_mask_range_pd (__m128d src, __mmask8 k, __m128d a, __m128d b, int imm8)
#include "immintrin.h"
Instruction: vrangepd
CPUID Flags: AVX512VL + AVX512DQ
Description
Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
Operation
RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
{
CASE opCtl[1:0]
0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
ESAC
CASE signSelCtl[1:0]
0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
1: dst[63:0] := tmp[63:0]
2: dst[63:0] := (0 << 63) OR (tmp[62:0])
3: dst[63:0] := (1 << 63) OR (tmp[62:0])
ESAC
RETURN dst
}
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vrangepd
__m128d _mm_maskz_range_pd (__mmask8 k, __m128d a, __m128d b, int imm8)
Synopsis
__m128d _mm_maskz_range_pd (__mmask8 k, __m128d a, __m128d b, int imm8)
#include "immintrin.h"
Instruction: vrangepd
CPUID Flags: AVX512VL + AVX512DQ
Description
Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
Operation
RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
{
CASE opCtl[1:0]
0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
ESAC
CASE signSelCtl[1:0]
0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
1: dst[63:0] := tmp[63:0]
2: dst[63:0] := (0 << 63) OR (tmp[62:0])
3: dst[63:0] := (1 << 63) OR (tmp[62:0])
ESAC
RETURN dst
}
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vrangepd
__m128d _mm_range_pd (__m128d a, __m128d b, int imm8)
Synopsis
__m128d _mm_range_pd (__m128d a, __m128d b, int imm8)
#include "immintrin.h"
Instruction: vrangepd
CPUID Flags: AVX512VL + AVX512DQ
Description
Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.
imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
Operation
RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
{
CASE opCtl[1:0]
0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
ESAC
CASE signSelCtl[1:0]
0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
1: dst[63:0] := tmp[63:0]
2: dst[63:0] := (0 << 63) OR (tmp[62:0])
3: dst[63:0] := (1 << 63) OR (tmp[62:0])
ESAC
RETURN dst
}
FOR j := 0 to 1
i := j*64
dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
ENDFOR
dst[MAX:128] := 0
vrangepd
__m256d _mm256_mask_range_pd (__m256d src, __mmask8 k, __m256d a, __m256d b, int imm8)
Synopsis
__m256d _mm256_mask_range_pd (__m256d src, __mmask8 k, __m256d a, __m256d b, int imm8)
#include "immintrin.h"
Instruction: vrangepd
CPUID Flags: AVX512VL + AVX512DQ
Description
Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
Operation
RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
{
CASE opCtl[1:0]
0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
ESAC
CASE signSelCtl[1:0]
0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
1: dst[63:0] := tmp[63:0]
2: dst[63:0] := (0 << 63) OR (tmp[62:0])
3: dst[63:0] := (1 << 63) OR (tmp[62:0])
ESAC
RETURN dst
}
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vrangepd
__m256d _mm256_maskz_range_pd (__mmask8 k, __m256d a, __m256d b, int imm8)
Synopsis
__m256d _mm256_maskz_range_pd (__mmask8 k, __m256d a, __m256d b, int imm8)
#include "immintrin.h"
Instruction: vrangepd
CPUID Flags: AVX512VL + AVX512DQ
Description
Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
Operation
RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
{
CASE opCtl[1:0]
0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
ESAC
CASE signSelCtl[1:0]
0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
1: dst[63:0] := tmp[63:0]
2: dst[63:0] := (0 << 63) OR (tmp[62:0])
3: dst[63:0] := (1 << 63) OR (tmp[62:0])
ESAC
RETURN dst
}
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vrangepd
__m256d _mm256_range_pd (__m256d a, __m256d b, int imm8)
Synopsis
__m256d _mm256_range_pd (__m256d a, __m256d b, int imm8)
#include "immintrin.h"
Instruction: vrangepd
CPUID Flags: AVX512VL + AVX512DQ
Description
Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.
imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
Operation
RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
{
CASE opCtl[1:0]
0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
ESAC
CASE signSelCtl[1:0]
0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
1: dst[63:0] := tmp[63:0]
2: dst[63:0] := (0 << 63) OR (tmp[62:0])
3: dst[63:0] := (1 << 63) OR (tmp[62:0])
ESAC
RETURN dst
}
FOR j := 0 to 3
i := j*64
dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
ENDFOR
dst[MAX:256] := 0
vrangepd
__m512d _mm512_mask_range_pd (__m512d src, __mmask8 k, __m512d a, __m512d b, int imm8)
Synopsis
__m512d _mm512_mask_range_pd (__m512d src, __mmask8 k, __m512d a, __m512d b, int imm8)
#include "immintrin.h"
Instruction: vrangepd
CPUID Flags: AVX512DQ
Description
Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
Operation
RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
{
CASE opCtl[1:0]
0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
ESAC
CASE signSelCtl[1:0]
0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
1: dst[63:0] := tmp[63:0]
2: dst[63:0] := (0 << 63) OR (tmp[62:0])
3: dst[63:0] := (1 << 63) OR (tmp[62:0])
ESAC
RETURN dst
}
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vrangepd
__m512d _mm512_maskz_range_pd (__mmask8 k, __m512d a, __m512d b, int imm8)
Synopsis
__m512d _mm512_maskz_range_pd (__mmask8 k, __m512d a, __m512d b, int imm8)
#include "immintrin.h"
Instruction: vrangepd
CPUID Flags: AVX512DQ
Description
Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
Operation
RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
{
CASE opCtl[1:0]
0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
ESAC
CASE signSelCtl[1:0]
0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
1: dst[63:0] := tmp[63:0]
2: dst[63:0] := (0 << 63) OR (tmp[62:0])
3: dst[63:0] := (1 << 63) OR (tmp[62:0])
ESAC
RETURN dst
}
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vrangepd
__m512d _mm512_range_pd (__m512d a, __m512d b, int imm8)
Synopsis
__m512d _mm512_range_pd (__m512d a, __m512d b, int imm8)
#include "immintrin.h"
Instruction: vrangepd
CPUID Flags: AVX512DQ
Description
Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.
imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
Operation
RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
{
CASE opCtl[1:0]
0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
ESAC
CASE signSelCtl[1:0]
0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
1: dst[63:0] := tmp[63:0]
2: dst[63:0] := (0 << 63) OR (tmp[62:0])
3: dst[63:0] := (1 << 63) OR (tmp[62:0])
ESAC
RETURN dst
}
FOR j := 0 to 7
i := j*64
dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
ENDFOR
dst[MAX:512] := 0
vrangeps
__m128 _mm_mask_range_ps (__m128 src, __mmask8 k, __m128 a, __m128 b, int imm8)
Synopsis
__m128 _mm_mask_range_ps (__m128 src, __mmask8 k, __m128 a, __m128 b, int imm8)
#include "immintrin.h"
Instruction: vrangeps
CPUID Flags: AVX512VL + AVX512DQ
Description
Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
Operation
RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
{
CASE opCtl[1:0]
0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
ESAC
CASE signSelCtl[1:0]
0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
1: dst[31:0] := tmp[31:0]
2: dst[31:0] := (0 << 31) OR (tmp[30:0])
3: dst[31:0] := (1 << 31) OR (tmp[30:0])
ESAC
RETURN dst
}
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vrangeps
__m128 _mm_maskz_range_ps (__mmask8 k, __m128 a, __m128 b, int imm8)
Synopsis
__m128 _mm_maskz_range_ps (__mmask8 k, __m128 a, __m128 b, int imm8)
#include "immintrin.h"
Instruction: vrangeps
CPUID Flags: AVX512VL + AVX512DQ
Description
Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
Operation
RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
{
CASE opCtl[1:0]
0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
ESAC
CASE signSelCtl[1:0]
0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
1: dst[31:0] := tmp[31:0]
2: dst[31:0] := (0 << 31) OR (tmp[30:0])
3: dst[31:0] := (1 << 31) OR (tmp[30:0])
ESAC
RETURN dst
}
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vrangeps
__m128 _mm_range_ps (__m128 a, __m128 b, int imm8)
Synopsis
__m128 _mm_range_ps (__m128 a, __m128 b, int imm8)
#include "immintrin.h"
Instruction: vrangeps
CPUID Flags: AVX512VL + AVX512DQ
Description
Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.
imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
Operation
RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
{
CASE opCtl[1:0]
0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
ESAC
CASE signSelCtl[1:0]
0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
1: dst[31:0] := tmp[31:0]
2: dst[31:0] := (0 << 31) OR (tmp[30:0])
3: dst[31:0] := (1 << 31) OR (tmp[30:0])
ESAC
RETURN dst
}
FOR j := 0 to 3
i := j*32
dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
ENDFOR
dst[MAX:128] := 0
vrangeps
__m256 _mm256_mask_range_ps (__m256 src, __mmask8 k, __m256 a, __m256 b, int imm8)
Synopsis
__m256 _mm256_mask_range_ps (__m256 src, __mmask8 k, __m256 a, __m256 b, int imm8)
#include "immintrin.h"
Instruction: vrangeps
CPUID Flags: AVX512VL + AVX512DQ
Description
Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
Operation
RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
{
CASE opCtl[1:0]
0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
ESAC
CASE signSelCtl[1:0]
0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
1: dst[31:0] := tmp[31:0]
2: dst[31:0] := (0 << 31) OR (tmp[30:0])
3: dst[31:0] := (1 << 31) OR (tmp[30:0])
ESAC
RETURN dst
}
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vrangeps
__m256 _mm256_maskz_range_ps (__mmask8 k, __m256 a, __m256 b, int imm8)
Synopsis
__m256 _mm256_maskz_range_ps (__mmask8 k, __m256 a, __m256 b, int imm8)
#include "immintrin.h"
Instruction: vrangeps
CPUID Flags: AVX512VL + AVX512DQ
Description
Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
Operation
RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
{
CASE opCtl[1:0]
0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
ESAC
CASE signSelCtl[1:0]
0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
1: dst[31:0] := tmp[31:0]
2: dst[31:0] := (0 << 31) OR (tmp[30:0])
3: dst[31:0] := (1 << 31) OR (tmp[30:0])
ESAC
RETURN dst
}
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vrangeps
__m256 _mm256_range_ps (__m256 a, __m256 b, int imm8)
Synopsis
__m256 _mm256_range_ps (__m256 a, __m256 b, int imm8)
#include "immintrin.h"
Instruction: vrangeps
CPUID Flags: AVX512VL + AVX512DQ
Description
Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.
imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
Operation
RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
{
CASE opCtl[1:0]
0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
ESAC
CASE signSelCtl[1:0]
0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
1: dst[31:0] := tmp[31:0]
2: dst[31:0] := (0 << 31) OR (tmp[30:0])
3: dst[31:0] := (1 << 31) OR (tmp[30:0])
ESAC
RETURN dst
}
FOR j := 0 to 7
i := j*32
dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
ENDFOR
dst[MAX:256] := 0
vrangeps
__m512 _mm512_mask_range_ps (__m512 src, __mmask16 k, __m512 a, __m512 b, int imm8)
Synopsis
__m512 _mm512_mask_range_ps (__m512 src, __mmask16 k, __m512 a, __m512 b, int imm8)
#include "immintrin.h"
Instruction: vrangeps
CPUID Flags: AVX512DQ
Description
Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
Operation
RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
{
CASE opCtl[1:0]
0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
ESAC
CASE signSelCtl[1:0]
0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
1: dst[31:0] := tmp[31:0]
2: dst[31:0] := (0 << 31) OR (tmp[30:0])
3: dst[31:0] := (1 << 31) OR (tmp[30:0])
ESAC
RETURN dst
}
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vrangeps
__m512 _mm512_maskz_range_ps (__mmask16 k, __m512 a, __m512 b, int imm8)
Synopsis
__m512 _mm512_maskz_range_ps (__mmask16 k, __m512 a, __m512 b, int imm8)
#include "immintrin.h"
Instruction: vrangeps
CPUID Flags: AVX512DQ
Description
Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
Operation
RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
{
CASE opCtl[1:0]
0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
ESAC
CASE signSelCtl[1:0]
0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
1: dst[31:0] := tmp[31:0]
2: dst[31:0] := (0 << 31) OR (tmp[30:0])
3: dst[31:0] := (1 << 31) OR (tmp[30:0])
ESAC
RETURN dst
}
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vrangeps
__m512 _mm512_range_ps (__m512 a, __m512 b, int imm8)
Synopsis
__m512 _mm512_range_ps (__m512 a, __m512 b, int imm8)
#include "immintrin.h"
Instruction: vrangeps
CPUID Flags: AVX512DQ
Description
Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.
imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
Operation
RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
{
CASE opCtl[1:0]
0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
ESAC
CASE signSelCtl[1:0]
0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
1: dst[31:0] := tmp[31:0]
2: dst[31:0] := (0 << 31) OR (tmp[30:0])
3: dst[31:0] := (1 << 31) OR (tmp[30:0])
ESAC
RETURN dst
}
FOR j := 0 to 15
i := j*32
dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
ENDFOR
dst[MAX:512] := 0
vrangepd
__m512d _mm512_mask_range_round_pd (__m512d src, __mmask8 k, __m512d a, __m512d b, int imm8, int rounding)
Synopsis
__m512d _mm512_mask_range_round_pd (__m512d src, __mmask8 k, __m512d a, __m512d b, int imm8, int rounding)
#include "immintrin.h"
Instruction: vrangepd
CPUID Flags: AVX512DQ
Description
Calculate the max, min, absolute max, or absolute min (depending on control in
imm8) for packed double-precision (64-bit) floating-point elements in
a and
b, and store the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set).
imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
{
CASE opCtl[1:0]
0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
ESAC
CASE signSelCtl[1:0]
0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
1: dst[63:0] := tmp[63:0]
2: dst[63:0] := (0 << 63) OR (tmp[62:0])
3: dst[63:0] := (1 << 63) OR (tmp[62:0])
ESAC
RETURN dst
}
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vrangepd
__m512d _mm512_maskz_range_round_pd (__mmask8 k, __m512d a, __m512d b, int imm8, int rounding)
Synopsis
__m512d _mm512_maskz_range_round_pd (__mmask8 k, __m512d a, __m512d b, int imm8, int rounding)
#include "immintrin.h"
Instruction: vrangepd
CPUID Flags: AVX512DQ
Description
Calculate the max, min, absolute max, or absolute min (depending on control in
imm8) for packed double-precision (64-bit) floating-point elements in
a and
b, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set).
imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
{
CASE opCtl[1:0]
0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
ESAC
CASE signSelCtl[1:0]
0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
1: dst[63:0] := tmp[63:0]
2: dst[63:0] := (0 << 63) OR (tmp[62:0])
3: dst[63:0] := (1 << 63) OR (tmp[62:0])
ESAC
RETURN dst
}
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vrangepd
__m512d _mm512_range_round_pd (__m512d a, __m512d b, int imm8, int rounding)
Synopsis
__m512d _mm512_range_round_pd (__m512d a, __m512d b, int imm8, int rounding)
#include "immintrin.h"
Instruction: vrangepd
CPUID Flags: AVX512DQ
Description
Calculate the max, min, absolute max, or absolute min (depending on control in
imm8) for packed double-precision (64-bit) floating-point elements in
a and
b, and store the results in
dst.
imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
{
CASE opCtl[1:0]
0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
ESAC
CASE signSelCtl[1:0]
0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
1: dst[63:0] := tmp[63:0]
2: dst[63:0] := (0 << 63) OR (tmp[62:0])
3: dst[63:0] := (1 << 63) OR (tmp[62:0])
ESAC
RETURN dst
}
FOR j := 0 to 7
i := j*64
dst[i+63:i] := RANGE(a[i+63:i], b[i+63:i], imm8[1:0], imm8[3:2])
ENDFOR
dst[MAX:512] := 0
vrangeps
__m512 _mm512_mask_range_round_ps (__m512 src, __mmask16 k, __m512 a, __m512 b, int imm8, int rounding)
Synopsis
__m512 _mm512_mask_range_round_ps (__m512 src, __mmask16 k, __m512 a, __m512 b, int imm8, int rounding)
#include "immintrin.h"
Instruction: vrangeps
CPUID Flags: AVX512DQ
Description
Calculate the max, min, absolute max, or absolute min (depending on control in
imm8) for packed single-precision (32-bit) floating-point elements in
a and
b, and store the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set).
imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
{
CASE opCtl[1:0]
0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
ESAC
CASE signSelCtl[1:0]
0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
1: dst[31:0] := tmp[31:0]
2: dst[31:0] := (0 << 31) OR (tmp[30:0])
3: dst[31:0] := (1 << 31) OR (tmp[30:0])
ESAC
RETURN dst
}
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vrangeps
__m512 _mm512_maskz_range_round_ps (__mmask16 k, __m512 a, __m512 b, int imm8, int rounding)
Synopsis
__m512 _mm512_maskz_range_round_ps (__mmask16 k, __m512 a, __m512 b, int imm8, int rounding)
#include "immintrin.h"
Instruction: vrangeps
CPUID Flags: AVX512DQ
Description
Calculate the max, min, absolute max, or absolute min (depending on control in
imm8) for packed single-precision (32-bit) floating-point elements in
a and
b, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set).
imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
{
CASE opCtl[1:0]
0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
ESAC
CASE signSelCtl[1:0]
0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
1: dst[31:0] := tmp[31:0]
2: dst[31:0] := (0 << 31) OR (tmp[30:0])
3: dst[31:0] := (1 << 31) OR (tmp[30:0])
ESAC
RETURN dst
}
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vrangeps
__m512 _mm512_range_round_ps (__m512 a, __m512 b, int imm8, int rounding)
Synopsis
__m512 _mm512_range_round_ps (__m512 a, __m512 b, int imm8, int rounding)
#include "immintrin.h"
Instruction: vrangeps
CPUID Flags: AVX512DQ
Description
Calculate the max, min, absolute max, or absolute min (depending on control in
imm8) for packed single-precision (32-bit) floating-point elements in
a and
b, and store the results in
dst.
imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
{
CASE opCtl[1:0]
0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
ESAC
CASE signSelCtl[1:0]
0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
1: dst[31:0] := tmp[31:0]
2: dst[31:0] := (0 << 31) OR (tmp[30:0])
3: dst[31:0] := (1 << 31) OR (tmp[30:0])
ESAC
RETURN dst
}
FOR j := 0 to 15
i := j*32
dst[i+31:i] := RANGE(a[i+31:i], b[i+31:i], imm8[1:0], imm8[3:2])
ENDFOR
dst[MAX:512] := 0
vrangesd
__m128d _mm_mask_range_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int imm8, int rounding)
Synopsis
__m128d _mm_mask_range_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int imm8, int rounding)
#include "immintrin.h"
Instruction: vrangesd
CPUID Flags: AVX512DQ
Description
Calculate the max, min, absolute max, or absolute min (depending on control in
imm8) for the lower double-precision (64-bit) floating-point element in
a and
b, store the result in the lower element of
dst using writemask
k (the element is copied from
src when mask bit 0 is not set), and copy the upper element from
a to the upper element of
dst.
imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
{
CASE opCtl[1:0]
0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
ESAC
CASE signSelCtl[1:0]
0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
1: dst[63:0] := tmp[63:0]
2: dst[63:0] := (0 << 63) OR (tmp[62:0])
3: dst[63:0] := (1 << 63) OR (tmp[62:0])
ESAC
RETURN dst
}
IF k[0]
dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2])
ELSE
dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vrangesd
__m128d _mm_maskz_range_round_sd (__mmask8 k, __m128d a, __m128d b, int imm8, int rounding)
Synopsis
__m128d _mm_maskz_range_round_sd (__mmask8 k, __m128d a, __m128d b, int imm8, int rounding)
#include "immintrin.h"
Instruction: vrangesd
CPUID Flags: AVX512DQ
Description
Calculate the max, min, absolute max, or absolute min (depending on control in
imm8) for the lower double-precision (64-bit) floating-point element in
a and
b, store the result in the lower element of
dst using zeromask
k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from
a to the upper element of
dst.
imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
{
CASE opCtl[1:0]
0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
ESAC
CASE signSelCtl[1:0]
0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
1: dst[63:0] := tmp[63:0]
2: dst[63:0] := (0 << 63) OR (tmp[62:0])
3: dst[63:0] := (1 << 63) OR (tmp[62:0])
ESAC
RETURN dst
}
IF k[0]
dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2])
ELSE
dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vrangesd
__m128d _mm_range_round_sd (__m128d a, __m128d b, int imm8, int rounding)
Synopsis
__m128d _mm_range_round_sd (__m128d a, __m128d b, int imm8, int rounding)
#include "immintrin.h"
Instruction: vrangesd
CPUID Flags: AVX512DQ
Description
Calculate the max, min, absolute max, or absolute min (depending on control in
imm8) for the lower double-precision (64-bit) floating-point element in
a and
b, store the result in the lower element of
dst, and copy the upper element from
a to the upper element of
dst.
imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
{
CASE opCtl[1:0]
0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
ESAC
CASE signSelCtl[1:0]
0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
1: dst[63:0] := tmp[63:0]
2: dst[63:0] := (0 << 63) OR (tmp[62:0])
3: dst[63:0] := (1 << 63) OR (tmp[62:0])
ESAC
RETURN dst
}
dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2])
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vrangess
__m128 _mm_mask_range_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int imm8, int rounding)
Synopsis
__m128 _mm_mask_range_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int imm8, int rounding)
#include "immintrin.h"
Instruction: vrangess
CPUID Flags: AVX512DQ
Description
Calculate the max, min, absolute max, or absolute min (depending on control in
imm8) for the lower single-precision (32-bit) floating-point element in
a and
b, store the result in the lower element of
dst using writemask
k (the element is copied from
src when mask bit 0 is not set), and copy the upper 3 packed elements from
a to the upper elements of
dst.
imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
{
CASE opCtl[1:0]
0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
ESAC
CASE signSelCtl[1:0]
0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
1: dst[31:0] := tmp[31:0]
2: dst[31:0] := (0 << 31) OR (tmp[30:0])
3: dst[31:0] := (1 << 31) OR (tmp[30:0])
ESAC
RETURN dst
}
IF k[0]
dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2])
ELSE
dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vrangess
__m128 _mm_maskz_range_round_ss (__mmask8 k, __m128 a, __m128 b, int imm8, int rounding)
Synopsis
__m128 _mm_maskz_range_round_ss (__mmask8 k, __m128 a, __m128 b, int imm8, int rounding)
#include "immintrin.h"
Instruction: vrangess
CPUID Flags: AVX512DQ
Description
Calculate the max, min, absolute max, or absolute min (depending on control in
imm8) for the lower single-precision (32-bit) floating-point element in
a and
b, store the result in the lower element of
dst using zeromask
k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from
a to the upper elements of
dst.
imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
{
CASE opCtl[1:0]
0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
ESAC
CASE signSelCtl[1:0]
0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
1: dst[31:0] := tmp[31:0]
2: dst[31:0] := (0 << 31) OR (tmp[30:0])
3: dst[31:0] := (1 << 31) OR (tmp[30:0])
ESAC
RETURN dst
}
IF k[0]
dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2])
ELSE
dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vrangess
__m128 _mm_range_round_ss (__m128 a, __m128 b, int imm8, int rounding)
Synopsis
__m128 _mm_range_round_ss (__m128 a, __m128 b, int imm8, int rounding)
#include "immintrin.h"
Instruction: vrangess
CPUID Flags: AVX512DQ
Description
Calculate the max, min, absolute max, or absolute min (depending on control in
imm8) for the lower single-precision (32-bit) floating-point element in
a and
b, store the result in the lower element of
dst, and copy the upper 3 packed elements from
a to the upper elements of
dst.
imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
{
CASE opCtl[1:0]
0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
ESAC
CASE signSelCtl[1:0]
0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
1: dst[31:0] := tmp[31:0]
2: dst[31:0] := (0 << 31) OR (tmp[30:0])
3: dst[31:0] := (1 << 31) OR (tmp[30:0])
ESAC
RETURN dst
}
dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2])
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vrangesd
__m128d _mm_mask_range_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int imm8)
Synopsis
__m128d _mm_mask_range_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int imm8)
#include "immintrin.h"
Instruction: vrangesd
CPUID Flags: AVX512DQ
Description
Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
Operation
RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
{
CASE opCtl[1:0]
0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
ESAC
CASE signSelCtl[1:0]
0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
1: dst[63:0] := tmp[63:0]
2: dst[63:0] := (0 << 63) OR (tmp[62:0])
3: dst[63:0] := (1 << 63) OR (tmp[62:0])
ESAC
RETURN dst
}
IF k[0]
dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2])
ELSE
dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vrangesd
__m128d _mm_maskz_range_sd (__mmask8 k, __m128d a, __m128d b, int imm8)
Synopsis
__m128d _mm_maskz_range_sd (__mmask8 k, __m128d a, __m128d b, int imm8)
#include "immintrin.h"
Instruction: vrangesd
CPUID Flags: AVX512DQ
Description
Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
Operation
RANGE(src1[63:0], src2[63:0], opCtl[1:0], signSelCtl[1:0])
{
CASE opCtl[1:0]
0: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src1[63:0] : src2[63:0]
1: tmp[63:0] := (src1[63:0] <= src2[63:0]) ? src2[63:0] : src1[63:0]
2: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src1[63:0] : src2[63:0]
3: tmp[63:0] := (ABS(src1[63:0]) <= ABS(src2[63:0])) ? src2[63:0] : src1[63:0]
ESAC
CASE signSelCtl[1:0]
0: dst[63:0] := (src1[63] << 63) OR (tmp[62:0])
1: dst[63:0] := tmp[63:0]
2: dst[63:0] := (0 << 63) OR (tmp[62:0])
3: dst[63:0] := (1 << 63) OR (tmp[62:0])
ESAC
RETURN dst
}
IF k[0]
dst[63:0] := RANGE(a[63:0], b[63:0], imm8[1:0], imm8[3:2])
ELSE
dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vrangess
__m128 _mm_mask_range_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int imm8)
Synopsis
__m128 _mm_mask_range_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int imm8)
#include "immintrin.h"
Instruction: vrangess
CPUID Flags: AVX512DQ
Description
Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
Operation
RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
{
CASE opCtl[1:0]
0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
ESAC
CASE signSelCtl[1:0]
0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
1: dst[31:0] := tmp[31:0]
2: dst[31:0] := (0 << 31) OR (tmp[30:0])
3: dst[31:0] := (1 << 31) OR (tmp[30:0])
ESAC
RETURN dst
}
IF k[0]
dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2])
ELSE
dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vrangess
__m128 _mm_maskz_range_ss (__mmask8 k, __m128 a, __m128 b, int imm8)
Synopsis
__m128 _mm_maskz_range_ss (__mmask8 k, __m128 a, __m128 b, int imm8)
#include "immintrin.h"
Instruction: vrangess
CPUID Flags: AVX512DQ
Description
Calculate the max, min, absolute max, or absolute min (depending on control in imm8) for the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
imm8[1:0] specifies the operation control: 00 = min, 01 = max, 10 = absolute max, 11 = absolute min.
imm8[3:2] specifies the sign control: 00 = sign from a, 01 = sign from compare result, 10 = clear sign bit, 11 = set sign bit.
Operation
RANGE(src1[31:0], src2[31:0], opCtl[1:0], signSelCtl[1:0])
{
CASE opCtl[1:0]
0: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src1[31:0] : src2[31:0]
1: tmp[31:0] := (src1[31:0] <= src2[31:0]) ? src2[31:0] : src1[31:0]
2: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src1[31:0] : src2[31:0]
3: tmp[31:0] := (ABS(src1[31:0]) <= ABS(src2[31:0])) ? src2[31:0] : src1[31:0]
ESAC
CASE signSelCtl[1:0]
0: dst[31:0] := (src1[31] << 31) OR (tmp[30:0])
1: dst[31:0] := tmp[31:0]
2: dst[31:0] := (0 << 31) OR (tmp[30:0])
3: dst[31:0] := (1 << 31) OR (tmp[30:0])
ESAC
RETURN dst
}
IF k[0]
dst[31:0] := RANGE(a[31:0], b[31:0], imm8[1:0], imm8[3:2])
ELSE
dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
rcpps
__m128 _mm_rcp_ps (__m128 a)
Synopsis
__m128 _mm_rcp_ps (__m128 a)
#include "xmmintrin.h"
Instruction: rcpps xmm, xmm
CPUID Flags: SSE
Description
Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 1.5*2^-12.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
ENDFOR
Performance
vrcpps
__m256 _mm256_rcp_ps (__m256 a)
Synopsis
__m256 _mm256_rcp_ps (__m256 a)
#include "immintrin.h"
Instruction: vrcpps ymm, ymm
CPUID Flags: AVX
Description
Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 1.5*2^-12.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
ENDFOR
dst[MAX:256] := 0
Performance
rcpss
__m128 _mm_rcp_ss (__m128 a)
Synopsis
__m128 _mm_rcp_ss (__m128 a)
#include "xmmintrin.h"
Instruction: rcpss xmm, xmm
CPUID Flags: SSE
Description
Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 1.5*2^-12.
Operation
dst[31:0] := APPROXIMATE(1.0/a[31:0])
dst[127:32] := a[127:32]
Performance
vrcp14pd
__m128d _mm_mask_rcp14_pd (__m128d src, __mmask8 k, __m128d a)
Synopsis
__m128d _mm_mask_rcp14_pd (__m128d src, __mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vrcp14pd
CPUID Flags: AVX512VL + AVX512F
Description
Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vrcp14pd
__m128d _mm_maskz_rcp14_pd (__mmask8 k, __m128d a)
Synopsis
__m128d _mm_maskz_rcp14_pd (__mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vrcp14pd
CPUID Flags: AVX512VL + AVX512F
Description
Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vrcp14pd
__m128d _mm_rcp14_pd (__m128d a)
Synopsis
__m128d _mm_rcp14_pd (__m128d a)
#include "immintrin.h"
Instruction: vrcp14pd
CPUID Flags: AVX512VL + AVX512F
Description
Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i])
ENDFOR
dst[MAX:128] := 0
vrcp14pd
__m256d _mm256_mask_rcp14_pd (__m256d src, __mmask8 k, __m256d a)
Synopsis
__m256d _mm256_mask_rcp14_pd (__m256d src, __mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vrcp14pd
CPUID Flags: AVX512VL + AVX512F
Description
Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vrcp14pd
__m256d _mm256_maskz_rcp14_pd (__mmask8 k, __m256d a)
Synopsis
__m256d _mm256_maskz_rcp14_pd (__mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vrcp14pd
CPUID Flags: AVX512VL + AVX512F
Description
Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vrcp14pd
__m256d _mm256_rcp14_pd (__m256d a)
Synopsis
__m256d _mm256_rcp14_pd (__m256d a)
#include "immintrin.h"
Instruction: vrcp14pd
CPUID Flags: AVX512VL + AVX512F
Description
Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i])
ENDFOR
dst[MAX:256] := 0
vrcp14pd
__m512d _mm512_mask_rcp14_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_rcp14_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vrcp14pd zmm {k}, zmm
CPUID Flags: AVX512F
Description
Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vrcp14pd
__m512d _mm512_maskz_rcp14_pd (__mmask8 k, __m512d a)
Synopsis
__m512d _mm512_maskz_rcp14_pd (__mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vrcp14pd zmm {k}, zmm
CPUID Flags: AVX512F
Description
Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vrcp14pd
__m512d _mm512_rcp14_pd (__m512d a)
Synopsis
__m512d _mm512_rcp14_pd (__m512d a)
#include "immintrin.h"
Instruction: vrcp14pd zmm {k}, zmm
CPUID Flags: AVX512F
Description
Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := APPROXIMATE(1.0/a[i+63:i])
ENDFOR
dst[MAX:512] := 0
vrcp14ps
__m128 _mm_mask_rcp14_ps (__m128 src, __mmask8 k, __m128 a)
Synopsis
__m128 _mm_mask_rcp14_ps (__m128 src, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vrcp14ps
CPUID Flags: AVX512VL + AVX512F
Description
Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vrcp14ps
__m128 _mm_maskz_rcp14_ps (__mmask8 k, __m128 a)
Synopsis
__m128 _mm_maskz_rcp14_ps (__mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vrcp14ps
CPUID Flags: AVX512VL + AVX512F
Description
Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vrcp14ps
__m128 _mm_rcp14_ps (__m128 a)
Synopsis
__m128 _mm_rcp14_ps (__m128 a)
#include "immintrin.h"
Instruction: vrcp14ps
CPUID Flags: AVX512VL + AVX512F
Description
Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
ENDFOR
dst[MAX:128] := 0
vrcp14ps
__m256 _mm256_mask_rcp14_ps (__m256 src, __mmask8 k, __m256 a)
Synopsis
__m256 _mm256_mask_rcp14_ps (__m256 src, __mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vrcp14ps
CPUID Flags: AVX512VL + AVX512F
Description
Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vrcp14ps
__m256 _mm256_maskz_rcp14_ps (__mmask8 k, __m256 a)
Synopsis
__m256 _mm256_maskz_rcp14_ps (__mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vrcp14ps
CPUID Flags: AVX512VL + AVX512F
Description
Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vrcp14ps
__m256 _mm256_rcp14_ps (__m256 a)
Synopsis
__m256 _mm256_rcp14_ps (__m256 a)
#include "immintrin.h"
Instruction: vrcp14ps
CPUID Flags: AVX512VL + AVX512F
Description
Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
ENDFOR
dst[MAX:256] := 0
vrcp14ps
__m512 _mm512_mask_rcp14_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_rcp14_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vrcp14ps zmm {k}, zmm
CPUID Flags: AVX512F
Description
Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vrcp14ps
__m512 _mm512_maskz_rcp14_ps (__mmask16 k, __m512 a)
Synopsis
__m512 _mm512_maskz_rcp14_ps (__mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vrcp14ps zmm {k}, zmm
CPUID Flags: AVX512F
Description
Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vrcp14ps
__m512 _mm512_rcp14_ps (__m512 a)
Synopsis
__m512 _mm512_rcp14_ps (__m512 a)
#include "immintrin.h"
Instruction: vrcp14ps zmm {k}, zmm
CPUID Flags: AVX512F
Description
Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
ENDFOR
dst[MAX:512] := 0
vrcp14sd
__m128d _mm_mask_rcp14_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_mask_rcp14_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vrcp14sd xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14.
Operation
IF k[0]
dst[63:0] := APPROXIMATE(1.0/b[63:0])
ELSE
dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vrcp14sd
__m128d _mm_maskz_rcp14_sd (__mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_maskz_rcp14_sd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vrcp14sd xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14.
Operation
IF k[0]
dst[63:0] := APPROXIMATE(1.0/b[63:0])
ELSE
dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vrcp14sd
__m128d _mm_rcp14_sd (__m128d a, __m128d b)
Synopsis
__m128d _mm_rcp14_sd (__m128d a, __m128d b)
#include "immintrin.h"
Instruction: vrcp14sd xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14.
Operation
dst[63:0] := APPROXIMATE(1.0/b[63:0])
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vrcp14ss
__m128 _mm_mask_rcp14_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_mask_rcp14_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vrcp14ss xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14.
Operation
IF k[0]
dst[31:0] := APPROXIMATE(1.0/b[31:0])
ELSE
dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vrcp14ss
__m128 _mm_maskz_rcp14_ss (__mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_maskz_rcp14_ss (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vrcp14ss xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14.
Operation
IF k[0]
dst[31:0] := APPROXIMATE(1.0/b[31:0])
ELSE
dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vrcp14ss
__m128 _mm_rcp14_ss (__m128 a, __m128 b)
Synopsis
__m128 _mm_rcp14_ss (__m128 a, __m128 b)
#include "immintrin.h"
Instruction: vrcp14ss xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14.
Operation
dst[31:0] := APPROXIMATE(1.0/b[31:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vrcp23ps
__m512 _mm512_mask_rcp23_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_rcp23_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vrcp23ps zmm {k}, m512
CPUID Flags: KNCNI
Description
Approximates the reciprocals of packed single-precision (32-bit) floating-point elements in a to 23 bits of precision, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vrcp23ps
__m512 _mm512_rcp23_ps (__m512 a)
Synopsis
__m512 _mm512_rcp23_ps (__m512 a)
#include "immintrin.h"
Instruction: vrcp23ps zmm {k}, m512
CPUID Flags: KNCNI
Description
Approximates the reciprocals of packed single-precision (32-bit) floating-point elements in a to 23 bits of precision, storing the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
ENDFOR
dst[MAX:512] := 0
vrcp28pd
__m512d _mm512_mask_rcp28_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_rcp28_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vrcp28pd zmm {k}, zmm
CPUID Flags: AVX512ER
Description
Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28.
Operation
FOR j := 0 to 7
i := j*64;
IF k[j] THEN
dst[i+63:i] := RCP_28_SP(1.0/a[i+63:i]);
ELSE
dst[i+63:i] := src[i+63:i];
FI
ENDFOR;
vrcp28pd
__m512d _mm512_maskz_rcp28_pd (__mmask8 k, __m512d a)
Synopsis
__m512d _mm512_maskz_rcp28_pd (__mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vrcp28pd zmm {k}, zmm
CPUID Flags: AVX512ER
Description
Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28.
Operation
FOR j := 0 to 7
i := j*64;
IF k[j] THEN
dst[i+63:i] := RCP_28_SP(1.0/a[i+63:i]);
ELSE
dst[i+63:i] := 0;
FI
ENDFOR;
vrcp28pd
__m512d _mm512_rcp28_pd (__m512d a)
Synopsis
__m512d _mm512_rcp28_pd (__m512d a)
#include "immintrin.h"
Instruction: vrcp28pd zmm {k}, zmm
CPUID Flags: AVX512ER
Description
Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-28.
Operation
FOR j := 0 to 7
i := j*64;
dst[i+63:i] := RCP_28_SP(1.0/a[i+63:i]);
ENDFOR;
vrcp28ps
__m512 _mm512_mask_rcp28_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_rcp28_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vrcp28ps zmm {k}, zmm
CPUID Flags: AVX512ER
Description
Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28.
Operation
FOR j := 0 to 15
i := j*32;
IF k[j] THEN
dst[i+31:i] := RCP_28_SP(1.0/a[i+31:i]);
ELSE
dst[i+31:i] := src[i+31:i];
FI
ENDFOR;
vrcp28ps
__m512 _mm512_maskz_rcp28_ps (__mmask16 k, __m512 a)
Synopsis
__m512 _mm512_maskz_rcp28_ps (__mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vrcp28ps zmm {k}, zmm
CPUID Flags: AVX512ER
Description
Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28.
Operation
FOR j := 0 to 15
i := j*32;
IF k[j] THEN
dst[i+31:i] := RCP_28_SP(1.0/a[i+31:i]);
ELSE
dst[i+31:i] := 0;
FI
ENDFOR;
vrcp28ps
__m512 _mm512_rcp28_ps (__m512 a)
Synopsis
__m512 _mm512_rcp28_ps (__m512 a)
#include "immintrin.h"
Instruction: vrcp28ps zmm {k}, zmm
CPUID Flags: AVX512ER
Description
Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-28.
Operation
FOR j := 0 to 15
i := j*32;
dst[i+31:i] := RCP_28_SP(1.0/a[i+31:i]);
ENDFOR;
vrcp28pd
__m512d _mm512_mask_rcp28_round_pd (__m512d src, __mmask8 k, __m512d a, int rounding)
Synopsis
__m512d _mm512_mask_rcp28_round_pd (__m512d src, __mmask8 k, __m512d a, int rounding)
#include "immintrin.h"
Instruction: vrcp28pd zmm {k}, zmm {er}
CPUID Flags: AVX512ER
Description
Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in
a, and store the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64;
IF k[j] THEN
dst[i+63:i] := RCP_28_SP(1.0/a[i+63:i]);
ELSE
dst[i+63:i] := src[i+63:i];
FI
ENDFOR;
vrcp28pd
__m512d _mm512_maskz_rcp28_round_pd (__mmask8 k, __m512d a, int rounding)
Synopsis
__m512d _mm512_maskz_rcp28_round_pd (__mmask8 k, __m512d a, int rounding)
#include "immintrin.h"
Instruction: vrcp28pd zmm {k}, zmm {er}
CPUID Flags: AVX512ER
Description
Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in
a, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64;
IF k[j] THEN
dst[i+63:i] := RCP_28_DP(1.0/a[i+63:i]);
ELSE
dst[i+63:i] := 0;
FI
ENDFOR;
vrcp28pd
__m512d _mm512_rcp28_round_pd (__m512d a, int rounding)
Synopsis
__m512d _mm512_rcp28_round_pd (__m512d a, int rounding)
#include "immintrin.h"
Instruction: vrcp28pd zmm {k}, zmm {er}
CPUID Flags: AVX512ER
Description
Compute the approximate reciprocal of packed double-precision (64-bit) floating-point elements in
a, and store the results in
dst. The maximum relative error for this approximation is less than 2^-28.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64;
dst[i+63:i] := RCP_28_DP(1.0/a[i+63:i]);
ENDFOR;
vrcp28ps
__m512 _mm512_mask_rcp28_round_ps (__m512 src, __mmask16 k, __m512 a, int rounding)
Synopsis
__m512 _mm512_mask_rcp28_round_ps (__m512 src, __mmask16 k, __m512 a, int rounding)
#include "immintrin.h"
Instruction: vrcp28ps zmm {k}, zmm {er}
CPUID Flags: AVX512ER
Description
Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in
a, and store the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32;
IF k[j] THEN
dst[i+31:i] := RCP_28_SP(1.0/a[i+31:i]);
ELSE
dst[i+31:i] := src[i+31:i];
FI
ENDFOR;
vrcp28ps
__m512 _mm512_maskz_rcp28_round_ps (__mmask16 k, __m512 a, int rounding)
Synopsis
__m512 _mm512_maskz_rcp28_round_ps (__mmask16 k, __m512 a, int rounding)
#include "immintrin.h"
Instruction: vrcp28ps zmm {k}, zmm {er}
CPUID Flags: AVX512ER
Description
Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in
a, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32;
IF k[j] THEN
dst[i+31:i] := RCP_28_SP(1.0/a[i+31:i]);
ELSE
dst[i+31:i] := 0;
FI
ENDFOR;
vrcp28ps
__m512 _mm512_rcp28_round_ps (__m512 a, int rounding)
Synopsis
__m512 _mm512_rcp28_round_ps (__m512 a, int rounding)
#include "immintrin.h"
Instruction: vrcp28ps zmm {k}, zmm {er}
CPUID Flags: AVX512ER
Description
Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in
a, and store the results in
dst. The maximum relative error for this approximation is less than 2^-28.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32;
dst[i+31:i] := RCP_28_SP(1.0/a[i+31:i]);
ENDFOR;
vrcp28sd
__m128d _mm_mask_rcp28_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int rounding)
Synopsis
__m128d _mm_mask_rcp28_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vrcp28sd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512ER
Description
Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in
b, store the result in the lower element of
dst using writemask
k (the element is copied from
src when mask bit 0 is not set), and copy the upper element from
a to the upper element of
dst. The maximum relative error for this approximation is less than 2^-28.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0] THEN
dst[63:0] := RCP_28_DP(1.0/b[63:0]);
ELSE
dst[63:0] := src[63:0];
FI
dst[127:64] := a[127:64];
dst[MAX:128] := 0;
vrcp28sd
__m128d _mm_maskz_rcp28_round_sd (__mmask8 k, __m128d a, __m128d b, int rounding)
Synopsis
__m128d _mm_maskz_rcp28_round_sd (__mmask8 k, __m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vrcp28sd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512ER
Description
Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in
b, store the result in the lower element of
dst using zeromask
k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from
a to the upper element of
dst. The maximum relative error for this approximation is less than 2^-28.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0] THEN
dst[63:0] := RCP_28_DP(1.0/b[63:0]);
ELSE
dst[63:0] := 0;
FI
dst[127:64] := a[127:64];
dst[MAX:128] := 0;
vrcp28sd
__m128d _mm_rcp28_round_sd (__m128d a, __m128d b, int rounding)
Synopsis
__m128d _mm_rcp28_round_sd (__m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vrcp28sd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512ER
Description
Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in
b, store the result in the lower element of
dst, and copy the upper element from
a to the upper element of
dst. The maximum relative error for this approximation is less than 2^-28.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[63:0] := RCP_28_DP(1.0/b[63:0]);
dst[127:64] := a[127:64];
dst[MAX:128] := 0;
vrcp28ss
__m128 _mm_mask_rcp28_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int rounding)
Synopsis
__m128 _mm_mask_rcp28_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vrcp28ss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512ER
Description
Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in
b, store the result in the lower element of
dst using writemask
k (the element is copied from
src when mask bit 0 is not set), and copy the upper 3 packed elements from
a to the upper elements of
dst. The maximum relative error for this approximation is less than 2^-28.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0] THEN
dst[31:0] := RCP_28_SP(1.0/b[31:0]);
ELSE
dst[31:0] := src[31:0];
FI
dst[127:32] := a[127:32];
dst[MAX:128] := 0;
vrcp28ss
__m128 _mm_maskz_rcp28_round_ss (__mmask8 k, __m128 a, __m128 b, int rounding)
Synopsis
__m128 _mm_maskz_rcp28_round_ss (__mmask8 k, __m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vrcp28ss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512ER
Description
Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in
b, store the result in the lower element of
dst using zeromask
k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from
a to the upper elements of
dst. The maximum relative error for this approximation is less than 2^-28.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0] THEN
dst[31:0] := RCP_28_SP(1.0/b[31:0]);
ELSE
dst[31:0] := 0;
FI
dst[127:32] := a[127:32];
dst[MAX:128] := 0;
vrcp28ss
__m128 _mm_rcp28_round_ss (__m128 a, __m128 b, int rounding)
Synopsis
__m128 _mm_rcp28_round_ss (__m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vrcp28ss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512ER
Description
Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in
b, store the result in the lower element of
dst, and copy the upper 3 packed elements from
a to the upper elements of
dst. The maximum relative error for this approximation is less than 2^-28.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[31:0] := RCP_28_SP(1.0/b[31:0]);
dst[127:32] := a[127:32];
dst[MAX:128] := 0;
vrcp28sd
__m128d _mm_mask_rcp28_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_mask_rcp28_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vrcp28sd xmm {k}, xmm, xmm
CPUID Flags: AVX512ER
Description
Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-28.
Operation
IF k[0] THEN
dst[63:0] := RCP_28_DP(1.0/b[63:0]);
ELSE
dst[63:0] := src[63:0];
FI
dst[127:64] := a[127:64];
dst[MAX:128] := 0;
vrcp28sd
__m128d _mm_maskz_rcp28_sd (__mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_maskz_rcp28_sd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vrcp28sd xmm {k}, xmm, xmm
CPUID Flags: AVX512ER
Description
Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-28.
Operation
IF k[0] THEN
dst[63:0] := RCP_28_DP(1.0/b[63:0]);
ELSE
dst[63:0] := 0;
FI
dst[127:64] := a[127:64];
dst[MAX:128] := 0;
vrcp28sd
__m128d _mm_rcp28_sd (__m128d a, __m128d b)
Synopsis
__m128d _mm_rcp28_sd (__m128d a, __m128d b)
#include "immintrin.h"
Instruction: vrcp28sd xmm {k}, xmm, xmm
CPUID Flags: AVX512ER
Description
Compute the approximate reciprocal of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-28.
Operation
dst[63:0] := RCP_28_DP(1.0/b[63:0]);
dst[127:64] := a[127:64];
dst[MAX:128] := 0;
vrcp28ss
__m128 _mm_mask_rcp28_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_mask_rcp28_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vrcp28ss xmm {k}, xmm, xmm
CPUID Flags: AVX512ER
Description
Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-28.
Operation
IF k[0] THEN
dst[31:0] := RCP_28_SP(1.0/b[31:0]);
ELSE
dst[31:0] := src[31:0];
FI
dst[127:32] := a[127:32];
dst[MAX:128] := 0;
vrcp28ss
__m128 _mm_maskz_rcp28_ss (__mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_maskz_rcp28_ss (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vrcp28ss xmm {k}, xmm, xmm
CPUID Flags: AVX512ER
Description
Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-28.
Operation
IF k[0] THEN
dst[31:0] := RCP_28_SP(1.0/b[31:0]);
ELSE
dst[31:0] := 0;
FI
dst[127:32] := a[127:32];
dst[MAX:128] := 0;
vrcp28ss
__m128 _mm_rcp28_ss (__m128 a, __m128 b)
Synopsis
__m128 _mm_rcp28_ss (__m128 a, __m128 b)
#include "immintrin.h"
Instruction: vrcp28ss xmm {k}, xmm, xmm
CPUID Flags: AVX512ER
Description
Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-28.
Operation
dst[31:0] := RCP_28_SP(1.0/b[31:0]);
dst[127:32] := a[127:32];
dst[MAX:128] := 0;
rdpmc
__int64 _rdpmc (int a)
Synopsis
__int64 _rdpmc (int a)
#include "immintrin.h"
Instruction: rdpmc
Description
Read the Performance Monitor Counter (PMC) specified by a, and store up to 64-bits in dst. The width of performance counters is implementation specific.
Operation
dst[63:0] := ReadPMC(a)
rdrand
int _rdrand16_step (unsigned short* val)
Synopsis
int _rdrand16_step (unsigned short* val)
#include "immintrin.h"
Instruction: rdrand r16
CPUID Flags: RDRAND
Description
Read a hardware generated 16-bit random value and store the result in val. Return 1 if a random value was generated, and 0 otherwise.
Operation
IF HW_RND_GEN.ready = 1
val[15:0] := HW_RND_GEN.data;
RETURN 1;
ELSE
val[15:0] := 0;
RETURN 0;
FI
rdrand
int _rdrand32_step (unsigned int* val)
Synopsis
int _rdrand32_step (unsigned int* val)
#include "immintrin.h"
Instruction: rdrand r32
CPUID Flags: RDRAND
Description
Read a hardware generated 32-bit random value and store the result in val. Return 1 if a random value was generated, and 0 otherwise.
Operation
IF HW_RND_GEN.ready = 1
val[31:0] := HW_RND_GEN.data;
RETURN 1;
ELSE
val[31:0] := 0;
RETURN 0;
FI
rdrand
int _rdrand64_step (unsigned __int64* val)
Synopsis
int _rdrand64_step (unsigned __int64* val)
#include "immintrin.h"
Instruction: rdrand r64
CPUID Flags: RDRAND
Description
Read a hardware generated 64-bit random value and store the result in val. Return 1 if a random value was generated, and 0 otherwise.
Operation
IF HW_RND_GEN.ready = 1
val[63:0] := HW_RND_GEN.data;
RETURN 1;
ELSE
val[63:0] := 0;
RETURN 0;
FI
Performance
rdseed
int _rdseed16_step (unsigned short * val)
Synopsis
int _rdseed16_step (unsigned short * val)
#include "immintrin.h"
Instruction: rdseed r16
CPUID Flags: RDSEED
Description
Read a 16-bit NIST SP800-90B and SP800-90C compliant random value and store in val. Return 1 if a random value was generated, and 0 otherwise.
Operation
IF HW_NRND_GEN.ready = 1 THEN
val[15:0] := HW_NRND_GEN.data
RETURN 1
ELSE
val[15:0] := 0
RETURN 0
FI
rdseed
int _rdseed32_step (unsigned int * val)
Synopsis
int _rdseed32_step (unsigned int * val)
#include "immintrin.h"
Instruction: rdseed r32
CPUID Flags: RDSEED
Description
Read a 32-bit NIST SP800-90B and SP800-90C compliant random value and store in val. Return 1 if a random value was generated, and 0 otherwise.
Operation
IF HW_NRND_GEN.ready = 1 THEN
val[31:0] := HW_NRND_GEN.data
RETURN 1
ELSE
val[31:0] := 0
RETURN 0
FI
rdseed
int _rdseed64_step (unsigned __int64 * val)
Synopsis
int _rdseed64_step (unsigned __int64 * val)
#include "immintrin.h"
Instruction: rdseed r64
CPUID Flags: RDSEED
Description
Read a 64-bit NIST SP800-90B and SP800-90C compliant random value and store in val. Return 1 if a random value was generated, and 0 otherwise.
Operation
IF HW_NRND_GEN.ready = 1 THEN
val[63:0] := HW_NRND_GEN.data
RETURN 1
ELSE
val[63:0] := 0
RETURN 0
FI
rdtsc
__int64 _rdtsc (void)
Synopsis
__int64 _rdtsc (void)
#include "immintrin.h"
Instruction: rdtsc
CPUID Flags: TSC
Description
Copy the current 64-bit value of the processor's time-stamp counter into dst.
Operation
dst[63:0] := TimeStampCounter
rdtscp
unsigned __int64 __rdtscp (unsigned int * mem_addr)
Synopsis
unsigned __int64 __rdtscp (unsigned int * mem_addr)
#include "immintrin.h"
Instruction: rdtscp
CPUID Flags: RDTSCP
Description
Copy the current 64-bit value of the processor's time-stamp counter into dst, and store the IA32_TSC_AUX MSR (signature value) into memory at mem_addr.
Operation
dst[63:0] := TimeStampCounter
MEM[mem_addr+31:mem_addr] := IA32_TSC_AUX[31:0]
Performance
rdfsbase
unsigned int _readfsbase_u32 ()
Synopsis
unsigned int _readfsbase_u32 ()
#include "immintrin.h"
Instruction: rdfsbase r32
CPUID Flags: FSGSBASE
Description
Read the FS segment base register and store the 32-bit result in dst.
Operation
dst[31:0] := FS_Segment_Base_Register;
dst[63:32] := 0
rdfsbase
unsigned __int64 _readfsbase_u64 ()
Synopsis
unsigned __int64 _readfsbase_u64 ()
#include "immintrin.h"
Instruction: rdfsbase r64
CPUID Flags: FSGSBASE
Description
Read the FS segment base register and store the 64-bit result in dst.
Operation
dst[63:0] := FS_Segment_Base_Register;
rdgsbase
unsigned int _readgsbase_u32 ()
Synopsis
unsigned int _readgsbase_u32 ()
#include "immintrin.h"
Instruction: rdgsbase r32
CPUID Flags: FSGSBASE
Description
Read the GS segment base register and store the 32-bit result in dst.
Operation
dst[31:0] := GS_Segment_Base_Register;
dst[63:32] := 0
rdgsbase
unsigned __int64 _readgsbase_u64 ()
Synopsis
unsigned __int64 _readgsbase_u64 ()
#include "immintrin.h"
Instruction: rdgsbase r64
CPUID Flags: FSGSBASE
Description
Read the GS segment base register and store the 64-bit result in dst.
Operation
dst[63:0] := GS_Segment_Base_Register;
...
__m512d _mm512_mask_recip_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_recip_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Computes the reciprocal of packed double-precision (64-bit) floating-point elements in a, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := (1 / a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m512d _mm512_recip_pd (__m512d a)
Synopsis
__m512d _mm512_recip_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Computes the reciprocal of packed double-precision (64-bit) floating-point elements in a, storing the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := (1 / a[i+63:i])
ENDFOR
dst[MAX:512] := 0
...
__m512 _mm512_mask_recip_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_recip_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Computes the reciprocal of packed single-precision (32-bit) floating-point elements in a, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := (1 / a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m512 _mm512_recip_ps (__m512 a)
Synopsis
__m512 _mm512_recip_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Computes the reciprocal of packed single-precision (32-bit) floating-point elements in a, storing the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := (1 / a[i+31:i])
ENDFOR
dst[MAX:512] := 0
...
int _mm512_mask_reduce_add_epi32 (__mmask16 k, __m512i a)
Synopsis
int _mm512_mask_reduce_add_epi32 (__mmask16 k, __m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Reduce the packed 32-bit integers in a by addition using mask k. Returns the sum of all active elements in a.
Operation
sum[31:0] := 0
FOR j := 0 to 15
i := j*32
IF k[j]
sum[31:0] := sum[31:0] + a[i+31:i]
FI
ENDFOR
RETURN sum[31:0]
...
int _mm512_reduce_add_epi32 (__m512i a)
Synopsis
int _mm512_reduce_add_epi32 (__m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Reduce the packed 32-bit integers in a by addition. Returns the sum of all elements in a.
Operation
sum[31:0] := 0
FOR j := 0 to 15
i := j*32
sum[31:0] := sum[31:0] + a[i+31:i]
ENDFOR
RETURN sum[31:0]
...
__int64 _mm512_mask_reduce_add_epi64 (__mmask8 k, __m512i a)
Synopsis
__int64 _mm512_mask_reduce_add_epi64 (__mmask8 k, __m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Reduce the packed 64-bit integers in a by addition using mask k. Returns the sum of all active elements in a.
Operation
sum[63:0] := 0
FOR j := 0 to 7
i := j*64
IF k[j]
sum[63:0] := sum[63:0] + a[i+63:i]
FI
ENDFOR
RETURN sum[63:0]
...
__int64 _mm512_reduce_add_epi64 (__m512i a)
Synopsis
__int64 _mm512_reduce_add_epi64 (__m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Reduce the packed 64-bit integers in a by addition. Returns the sum of all elements in a.
Operation
sum[63:0] := 0
FOR j := 0 to 7
i := j*64
sum[63:0] := sum[63:0] + a[i+63:i]
ENDFOR
RETURN sum[63:0]
...
double _mm512_mask_reduce_add_pd (__mmask8 k, __m512d a)
Synopsis
double _mm512_mask_reduce_add_pd (__mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Reduce the packed double-precision (64-bit) floating-point elements in a by addition using mask k. Returns the sum of all active elements in a.
Operation
sum[63:0] := 0
FOR j := 0 to 7
i := j*64
IF k[j]
sum[63:0] := sum[63:0] + a[i+63:i]
FI
ENDFOR
RETURN sum[63:0]
...
double _mm512_reduce_add_pd (__m512d a)
Synopsis
double _mm512_reduce_add_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Reduce the packed double-precision (64-bit) floating-point elements in a by addition. Returns the sum of all elements in a.
Operation
sum[63:0] := 0
FOR j := 0 to 7
i := j*64
sum[63:0] := sum[63:0] + a[i+63:i]
ENDFOR
RETURN sum[63:0]
...
float _mm512_mask_reduce_add_ps (__mmask16 k, __m512 a)
Synopsis
float _mm512_mask_reduce_add_ps (__mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Reduce the packed single-precision (32-bit) floating-point elements in a by addition using mask k. Returns the sum of all active elements in a.
Operation
sum[31:0] := 0
FOR j := 0 to 15
i := j*32
IF k[j]
sum[31:0] := sum[31:0] + a[i+31:i]
FI
ENDFOR
RETURN sum[31:0]
...
float _mm512_reduce_add_ps (__m512 a)
Synopsis
float _mm512_reduce_add_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Reduce the packed single-precision (32-bit) floating-point elements in a by addition. Returns the sum of all elements in a.
Operation
sum[31:0] := 0
FOR j := 0 to 15
i := j*32
sum[31:0] := sum[31:0] + a[i+31:i]
ENDFOR
RETURN sum[31:0]
...
int _mm512_mask_reduce_and_epi32 (__mmask16 k, __m512i a)
Synopsis
int _mm512_mask_reduce_and_epi32 (__mmask16 k, __m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Reduce the packed 32-bit integers in a by bitwise AND using mask k. Returns the bitwise AND of all active elements in a.
Operation
reduced[31:0] := 0xFFFFFFFF
FOR j := 0 to 15
i := j*32
IF k[j]
reduced[31:0] := reduced[31:0] AND a[i+31:i]
FI
ENDFOR
RETURN reduced[31:0]
...
int _mm512_reduce_and_epi32 (__m512i a)
Synopsis
int _mm512_reduce_and_epi32 (__m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Reduce the packed 32-bit integers in a by bitwise AND. Returns the bitwise AND of all elements in a.
Operation
reduced[31:0] := 0xFFFFFFFF
FOR j := 0 to 15
i := j*32
reduced[31:0] := reduced[31:0] AND a[i+31:i]
ENDFOR
RETURN reduced[31:0]
...
__int64 _mm512_mask_reduce_and_epi64 (__mmask8 k, __m512i a)
Synopsis
__int64 _mm512_mask_reduce_and_epi64 (__mmask8 k, __m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Reduce the packed 64-bit integers in a by bitwise AND using mask k. Returns the bitwise AND of all active elements in a.
Operation
reduced[63:0] := 0xFFFFFFFFFFFFFFFF
FOR j := 0 to 7
i := j*64
IF k[j]
reduced[63:0] := reduced[63:0] AND a[i+63:i]
FI
ENDFOR
RETURN reduced[63:0]
...
__int64 _mm512_reduce_and_epi64 (__m512i a)
Synopsis
__int64 _mm512_reduce_and_epi64 (__m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Reduce the packed 64-bit integers in a by bitwise AND. Returns the bitwise AND of all elements in a.
Operation
reduced[63:0] := 0xFFFFFFFFFFFFFFFF
FOR j := 0 to 7
i := j*64
reduced[63:0] := reduced[63:0] AND a[i+63:i]
ENDFOR
RETURN reduced[63:0]
...
double _mm512_mask_reduce_gmax_pd (__mmask8 k, __m512d a)
Synopsis
double _mm512_mask_reduce_gmax_pd (__mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: KNCNI
Description
Determines the maximum element of the packed double-precision (64-bit) floating-point elements stored in a and stores the result in dst. Bitmask k is used to exclude certain elements (elements are ignored when the corresponding mask bit is not set).
Operation
max = a[63:0]
FOR j := 1 to 7
i := j*64
IF k[j]
max = FpMax(max, a[i+63:i])
FI
ENDFOR
dst := max
...
double _mm512_reduce_gmax_pd (__m512d a)
Synopsis
double _mm512_reduce_gmax_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: KNCNI
Description
Determines the maximum element of the packed double-precision (64-bit) floating-point elements stored in a and stores the result in dst.
Operation
max = a[63:0]
FOR j := 1 to 7
i := j*64
max = FpMax(max, a[i+63:i])
ENDFOR
dst := max
...
float _mm512_mask_reduce_gmax_ps (__mmask16 k, __m512 a)
Synopsis
float _mm512_mask_reduce_gmax_ps (__mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: KNCNI
Description
Determines the maximum element of the packed single-precision (32-bit) floating-point elements stored in a and stores the result in dst. Bitmask k is used to exclude certain elements (elements are ignored when the corresponding mask bit is not set).
Operation
max = a[31:0]
FOR j := 1 to 15
i := j*32
IF k[j]
max = FpMax(max, a[i+31:i])
FI
ENDFOR
dst := max
...
float _mm512_reduce_gmax_ps (__m512 a)
Synopsis
float _mm512_reduce_gmax_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: KNCNI
Description
Determines the maximum element of the packed single-precision (32-bit) floating-point elements stored in a and stores the result in dst.
Operation
max = a[31:0]
FOR j := 1 to 15
i := j*32
max = FpMax(max, a[i+31:i])
ENDFOR
dst := max
...
double _mm512_mask_reduce_gmin_pd (__mmask8 k, __m512d a)
Synopsis
double _mm512_mask_reduce_gmin_pd (__mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: KNCNI
Description
Determines the minimum element of the packed double-precision (64-bit) floating-point elements stored in a and stores the result in dst. Bitmask k is used to exclude certain elements (elements are ignored when the corresponding mask bit is not set).
Operation
min = a[63:0]
FOR j := 1 to 7
i := j*64
IF k[j]
min = FpMin(min, a[i+63:i])
FI
ENDFOR
dst := min
...
double _mm512_reduce_gmin_pd (__m512d a)
Synopsis
double _mm512_reduce_gmin_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: KNCNI
Description
Determines the minimum element of the packed double-precision (64-bit) floating-point elements stored in a and stores the result in dst.
Operation
min = a[63:0]
FOR j := 1 to 7
i := j*64
min = FpMin(min, a[i+63:i])
ENDFOR
dst := min
...
float _mm512_mask_reduce_gmin_ps (__mmask16 k, __m512 a)
Synopsis
float _mm512_mask_reduce_gmin_ps (__mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: KNCNI
Description
Determines the minimum element of the packed single-precision (32-bit) floating-point elements stored in a and stores the result in dst using writemask k (elements are ignored when the corresponding mask bit is not set).
Operation
min = a[31:0]
FOR j := 1 to 15
i := j*32
IF k[j]
min = FpMin(min, a[i+31:i])
FI
ENDFOR
dst := min
...
float _mm512_reduce_gmin_ps (__m512 a)
Synopsis
float _mm512_reduce_gmin_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: KNCNI
Description
Determines the minimum element of the packed single-precision (32-bit) floating-point elements stored in a and stores the result in dst.
Operation
min = a[31:0]
FOR j := 1 to 15
i := j*32
min = FpMin(min, a[i+31:i])
ENDFOR
dst := min
...
int _mm512_mask_reduce_max_epi32 (__mmask16 k, __m512i a)
Synopsis
int _mm512_mask_reduce_max_epi32 (__mmask16 k, __m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Reduce the packed 32-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a.
Operation
max[31:0] := MIN_INT
FOR j := 0 to 15
i := j*32
IF k[j]
max[31:0] := MAXIMUM(max[31:0], a[i+31:i])
FI
ENDFOR
RETURN max[31:0]
...
int _mm512_reduce_max_epi32 (__m512i a)
Synopsis
int _mm512_reduce_max_epi32 (__m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Reduce the packed 32-bit integers in a by maximum. Returns the maximum of all elements in a.
Operation
max[31:0] := MIN_INT
FOR j := 0 to 15
i := j*32
max[31:0] := MAXIMUM(max[31:0], a[i+31:i])
ENDFOR
RETURN max[31:0]
...
__int64 _mm512_mask_reduce_max_epi64 (__mmask8 k, __m512i a)
Synopsis
__int64 _mm512_mask_reduce_max_epi64 (__mmask8 k, __m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Reduce the packed 64-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a.
Operation
max[63:0] := MIN_INT
FOR j := 0 to 7
i := j*64
IF k[j]
max[63:0] := MAXIMUM(max[63:0], a[i+63:i])
FI
ENDFOR
RETURN max[63:0]
...
__int64 _mm512_reduce_max_epi64 (__m512i a)
Synopsis
__int64 _mm512_reduce_max_epi64 (__m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Reduce the packed 64-bit integers in a by maximum. Returns the maximum of all elements in a.
Operation
max[63:0] := MIN_INT
FOR j := 0 to 7
i := j*64
max[63:0] := MAXIMUM(max[63:0], a[i+63:i])
ENDFOR
RETURN max[63:0]
...
unsigned int _mm512_mask_reduce_max_epu32 (__mmask16 k, __m512i a)
Synopsis
unsigned int _mm512_mask_reduce_max_epu32 (__mmask16 k, __m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Reduce the packed unsigned 32-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a.
Operation
max[31:0] := 0
FOR j := 0 to 15
i := j*32
IF k[j]
max[31:0] := MAXIMUM(max[31:0], a[i+31:i])
FI
ENDFOR
RETURN max[31:0]
...
unsigned int _mm512_reduce_max_epu32 (__m512i a)
Synopsis
unsigned int _mm512_reduce_max_epu32 (__m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Reduce the packed unsigned 32-bit integers in a by maximum. Returns the maximum of all elements in a.
Operation
max[31:0] := 0
FOR j := 0 to 15
i := j*32
max[31:0] := MAXIMUM(max[31:0], a[i+31:i])
ENDFOR
RETURN max[31:0]
...
unsigned __int64 _mm512_mask_reduce_max_epu64 (__mmask8 k, __m512i a)
Synopsis
unsigned __int64 _mm512_mask_reduce_max_epu64 (__mmask8 k, __m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Reduce the packed unsigned 64-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a.
Operation
max[63:0] := 0
FOR j := 0 to 7
i := j*64
IF k[j]
max[63:0] := MAXIMUM(max[63:0], a[i+63:i])
FI
ENDFOR
RETURN max[63:0]
...
unsigned __int64 _mm512_reduce_max_epu64 (__m512i a)
Synopsis
unsigned __int64 _mm512_reduce_max_epu64 (__m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Reduce the packed unsigned 64-bit integers in a by maximum. Returns the maximum of all elements in a.
Operation
max[63:0] := 0
FOR j := 0 to 7
i := j*64
max[63:0] := MAXIMUM(max[63:0], a[i+63:i])
ENDFOR
RETURN max[63:0]
...
double _mm512_mask_reduce_max_pd (__mmask8 k, __m512d a)
Synopsis
double _mm512_mask_reduce_max_pd (__mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Reduce the packed double-precision (64-bit) floating-point elements in a by maximum using mask k. Returns the maximum of all active elements in a.
Operation
max[63:0] := MIN_DOUBLE
FOR j := 0 to 7
i := j*64
IF k[j]
max[63:0] := MAXIMUM(max[63:0], a[i+63:i])
FI
ENDFOR
RETURN max[63:0]
...
double _mm512_reduce_max_pd (__m512d a)
Synopsis
double _mm512_reduce_max_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Reduce the packed double-precision (64-bit) floating-point elements in a by maximum. Returns the maximum of all elements in a.
Operation
max[63:0] := MIN_DOUBLE
FOR j := 0 to 7
i := j*64
max[63:0] := MAXIMUM(max[63:0], a[i+63:i])
ENDFOR
RETURN max[63:0]
...
float _mm512_mask_reduce_max_ps (__mmask16 k, __m512 a)
Synopsis
float _mm512_mask_reduce_max_ps (__mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Reduce the packed single-precision (32-bit) floating-point elements in a by maximum using mask k. Returns the maximum of all active elements in a.
Operation
max[31:0] := MIN_FLOAT
FOR j := 0 to 15
i := j*32
IF k[j]
max[31:0] := MAXIMUM(max[31:0], a[i+31:i])
FI
ENDFOR
RETURN max[31:0]
...
float _mm512_reduce_max_ps (__m512 a)
Synopsis
float _mm512_reduce_max_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Reduce the packed single-precision (32-bit) floating-point elements in a by maximum. Returns the maximum of all elements in a.
Operation
max[31:0] := MIN_FLOAT
FOR j := 0 to 15
i := j*32
max[31:0] := MAXIMUM(max[31:0], a[i+31:i])
ENDFOR
RETURN max[31:0]
...
int _mm512_mask_reduce_min_epi32 (__mmask16 k, __m512i a)
Synopsis
int _mm512_mask_reduce_min_epi32 (__mmask16 k, __m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Reduce the packed 32-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a.
Operation
min[31:0] := MAX_INT
FOR j := 0 to 15
i := j*32
IF k[j]
min[31:0] := MINIMUM(min[31:0], a[i+31:i])
FI
ENDFOR
RETURN min[31:0]
...
int _mm512_reduce_min_epi32 (__m512i a)
Synopsis
int _mm512_reduce_min_epi32 (__m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Reduce the packed 32-bit integers in a by minimum. Returns the minimum of all elements in a.
Operation
min[31:0] := MAX_INT
FOR j := 0 to 15
i := j*32
min[31:0] := MINIMUM(min[31:0], a[i+31:i])
ENDFOR
RETURN min[31:0]
...
__int64 _mm512_mask_reduce_min_epi64 (__mmask8 k, __m512i a)
Synopsis
__int64 _mm512_mask_reduce_min_epi64 (__mmask8 k, __m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Reduce the packed 64-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a.
Operation
min[63:0] := MAX_INT
FOR j := 0 to 7
i := j*64
IF k[j]
min[63:0] := MINIMUM(min[63:0], a[i+63:i])
FI
ENDFOR
RETURN min[63:0]
...
__int64 _mm512_reduce_min_epi64 (__m512i a)
Synopsis
__int64 _mm512_reduce_min_epi64 (__m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Reduce the packed 64-bit integers in a by minimum. Returns the minimum of all elements in a.
Operation
min[63:0] := MAX_INT
FOR j := 0 to 7
i := j*64
min[63:0] := MINIMUM(min[63:0], a[i+63:i])
ENDFOR
RETURN min[63:0]
...
unsigned int _mm512_mask_reduce_min_epu32 (__mmask16 k, __m512i a)
Synopsis
unsigned int _mm512_mask_reduce_min_epu32 (__mmask16 k, __m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Reduce the packed unsigned 32-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a.
Operation
min[31:0] := MAX_UINT
FOR j := 0 to 15
i := j*32
IF k[j]
min[31:0] := MINIMUM(min[31:0], a[i+31:i])
FI
ENDFOR
RETURN min[31:0]
...
unsigned int _mm512_reduce_min_epu32 (__m512i a)
Synopsis
unsigned int _mm512_reduce_min_epu32 (__m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Reduce the packed unsigned 32-bit integers in a by minimum. Returns the minimum of all elements in a.
Operation
min[31:0] := MAX_UINT
FOR j := 0 to 15
i := j*32
min[31:0] := MINIMUM(min[31:0], a[i+31:i])
ENDFOR
RETURN min[31:0]
...
unsigned __int64 _mm512_mask_reduce_min_epu64 (__mmask8 k, __m512i a)
Synopsis
unsigned __int64 _mm512_mask_reduce_min_epu64 (__mmask8 k, __m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Reduce the packed unsigned 64-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a.
Operation
min[63:0] := MAX_UINT
FOR j := 0 to 7
i := j*64
IF k[j]
min[63:0] := MINIMUM(min[63:0], a[i+63:i])
FI
ENDFOR
RETURN min[63:0]
...
unsigned __int64 _mm512_reduce_min_epu64 (__m512i a)
Synopsis
unsigned __int64 _mm512_reduce_min_epu64 (__m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Reduce the packed unsigned 64-bit integers in a by minimum. Returns the minimum of all elements in a.
Operation
min[63:0] := MAX_UINT
FOR j := 0 to 7
i := j*64
min[63:0] := MINIMUM(min[63:0], a[i+63:i])
ENDFOR
RETURN min[63:0]
...
double _mm512_mask_reduce_min_pd (__mmask8 k, __m512d a)
Synopsis
double _mm512_mask_reduce_min_pd (__mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Reduce the packed double-precision (64-bit) floating-point elements in a by minimum using mask k. Returns the minimum of all active elements in a.
Operation
min[63:0] := MAX_DOUBLE
FOR j := 0 to 7
i := j*64
IF k[j]
min[63:0] := MINIMUM(min[63:0], a[i+63:i])
FI
ENDFOR
RETURN min[63:0]
...
double _mm512_reduce_min_pd (__m512d a)
Synopsis
double _mm512_reduce_min_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Reduce the packed double-precision (64-bit) floating-point elements in a by minimum. Returns the minimum of all elements in a.
Operation
min[63:0] := MAX_DOUBLE
FOR j := 0 to 7
i := j*64
min[63:0] := MINIMUM(min[63:0], a[i+63:i])
ENDFOR
RETURN min[63:0]
...
float _mm512_mask_reduce_min_ps (__mmask16 k, __m512 a)
Synopsis
float _mm512_mask_reduce_min_ps (__mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Reduce the packed single-precision (32-bit) floating-point elements in a by minimum using mask k. Returns the minimum of all active elements in a.
Operation
min[31:0] := MAX_FLOAT
FOR j := 0 to 15
i := j*32
IF k[j]
min[31:0] := MINIMUM(min[31:0], a[i+31:i])
FI
ENDFOR
RETURN min[31:0]
...
float _mm512_reduce_min_ps (__m512 a)
Synopsis
float _mm512_reduce_min_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Reduce the packed single-precision (32-bit) floating-point elements in a by minimum. Returns the minimum of all elements in a.
Operation
min[31:0] := MAX_FLOAT
FOR j := 0 to 15
i := j*32
min[31:0] := MINIMUM(min[31:0], a[i+31:i])
ENDFOR
RETURN min[31:0]
...
int _mm512_mask_reduce_mul_epi32 (__mmask16 k, __m512i a)
Synopsis
int _mm512_mask_reduce_mul_epi32 (__mmask16 k, __m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Reduce the packed 32-bit integers in a by multiplication using mask k. Returns the product of all active elements in a.
Operation
prod[31:0] := 1
FOR j := 0 to 15
i := j*32
IF k[j]
prod[31:0] := prod[31:0] * a[i+31:i]
FI
ENDFOR
RETURN prod[31:0]
...
int _mm512_reduce_mul_epi32 (__m512i a)
Synopsis
int _mm512_reduce_mul_epi32 (__m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Reduce the packed 32-bit integers in a by multiplication. Returns the product of all elements in a.
Operation
prod[31:0] := 1
FOR j := 0 to 15
i := j*32
prod[31:0] := prod[31:0] * a[i+31:i]
ENDFOR
RETURN prod[31:0]
...
__int64 _mm512_mask_reduce_mul_epi64 (__mmask8 k, __m512i a)
Synopsis
__int64 _mm512_mask_reduce_mul_epi64 (__mmask8 k, __m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Reduce the packed 64-bit integers in a by multiplication using mask k. Returns the product of all active elements in a.
Operation
prod[63:0] := 1
FOR j := 0 to 7
i := j*64
IF k[j]
prod[63:0] := prod[63:0] * a[i+63:i]
FI
ENDFOR
RETURN prod[63:0]
...
__int64 _mm512_reduce_mul_epi64 (__m512i a)
Synopsis
__int64 _mm512_reduce_mul_epi64 (__m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Reduce the packed 64-bit integers in a by multiplication. Returns the product of all elements in a.
Operation
prod[63:0] := 1
FOR j := 0 to 7
i := j*64
prod[63:0] := prod[63:0] * a[i+63:i]
ENDFOR
RETURN prod[63:0]
...
double _mm512_mask_reduce_mul_pd (__mmask8 k, __m512d a)
Synopsis
double _mm512_mask_reduce_mul_pd (__mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Reduce the packed double-precision (64-bit) floating-point elements in a by multiplication using mask k. Returns the product of all active elements in a.
Operation
prod[63:0] := 1
FOR j := 0 to 7
i := j*64
IF k[j]
prod[63:0] := prod[63:0] * a[i+63:i]
FI
ENDFOR
RETURN prod[63:0]
...
double _mm512_reduce_mul_pd (__m512d a)
Synopsis
double _mm512_reduce_mul_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Reduce the packed double-precision (64-bit) floating-point elements in a by multiplication. Returns the product of all elements in a.
Operation
prod[63:0] := 1
FOR j := 0 to 7
i := j*64
prod[63:0] := prod[63:0] * a[i+63:i]
ENDFOR
RETURN prod[63:0]
...
float _mm512_mask_reduce_mul_ps (__mmask16 k, __m512 a)
Synopsis
float _mm512_mask_reduce_mul_ps (__mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Reduce the packed single-precision (32-bit) floating-point elements in a by multiplication using mask k. Returns the product of all active elements in a.
Operation
prod[31:0] := 1
FOR j := 0 to 15
i := j*32
IF k[j]
prod[31:0] := prod[31:0] * a[i+31:i]
FI
ENDFOR
RETURN prod[31:0]
...
float _mm512_reduce_mul_ps (__m512 a)
Synopsis
float _mm512_reduce_mul_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Reduce the packed single-precision (32-bit) floating-point elements in a by multiplication. Returns the product of all elements in a.
Operation
prod[31:0] := 1
FOR j := 0 to 15
i := j*32
prod[31:0] := prod[31:0] * a[i+31:i]
ENDFOR
RETURN prod[31:0]
...
int _mm512_mask_reduce_or_epi32 (__mmask16 k, __m512i a)
Synopsis
int _mm512_mask_reduce_or_epi32 (__mmask16 k, __m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Reduce the packed 32-bit integers in a by bitwise OR using mask k. Returns the bitwise OR of all active elements in a.
Operation
reduced[31:0] := 0
FOR j := 0 to 15
i := j*32
IF k[j]
reduced[31:0] := reduced[31:0] OR a[i+31:i]
FI
ENDFOR
RETURN reduced[31:0]
...
int _mm512_reduce_or_epi32 (__m512i a)
Synopsis
int _mm512_reduce_or_epi32 (__m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Reduce the packed 32-bit integers in a by bitwise OR. Returns the bitwise OR of all elements in a.
Operation
reduced[31:0] := 0
FOR j := 0 to 15
i := j*32
reduced[31:0] := reduced[31:0] OR a[i+31:i]
ENDFOR
RETURN reduced[31:0]
...
__int64 _mm512_mask_reduce_or_epi64 (__mmask8 k, __m512i a)
Synopsis
__int64 _mm512_mask_reduce_or_epi64 (__mmask8 k, __m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Reduce the packed 64-bit integers in a by bitwise OR using mask k. Returns the bitwise OR of all active elements in a.
Operation
reduced[63:0] := 0
FOR j := 0 to 7
i := j*64
IF k[j]
reduced[63:0] := reduced[63:0] OR a[i+63:i]
FI
ENDFOR
RETURN reduced[63:0]
...
__int64 _mm512_reduce_or_epi64 (__m512i a)
Synopsis
__int64 _mm512_reduce_or_epi64 (__m512i a)
#include "immintrin.h"
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Reduce the packed 64-bit integers in a by bitwise OR. Returns the bitwise OR of all elements in a.
Operation
reduced[63:0] := 0
FOR j := 0 to 7
i := j*64
reduced[63:0] := reduced[63:0] OR a[i+63:i]
ENDFOR
RETURN reduced[63:0]
vreducepd
__m128d _mm_mask_reduce_pd (__m128d src, __mmask8 k, __m128d a, int imm8)
Synopsis
__m128d _mm_mask_reduce_pd (__m128d src, __mmask8 k, __m128d a, int imm8)
#include "immintrin.h"
Instruction: vreducepd
CPUID Flags: AVX512VL + AVX512DQ
Description
Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by the number of bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
ReduceArgumentPD(src1[63:0], imm8[7:0])
{
m := imm8[7:4] // number of fraction bits after the binary point to be preserved
rc := imm8[1:0] // round control
rc_src := imm8[2] // round control source
spe := 0
tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
tmp[63:0] := src1[63:0] - tmp[63:0]
RETURN tmp[63:0]
}
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vreducepd
__m128d _mm_maskz_reduce_pd (__mmask8 k, __m128d a, int imm8)
Synopsis
__m128d _mm_maskz_reduce_pd (__mmask8 k, __m128d a, int imm8)
#include "immintrin.h"
Instruction: vreducepd
CPUID Flags: AVX512VL + AVX512DQ
Description
Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by the number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
ReduceArgumentPD(src1[63:0], imm8[7:0])
{
m := imm8[7:4] // number of fraction bits after the binary point to be preserved
rc := imm8[1:0] // round control
rc_src := imm8[2] // round control source
spe := 0
tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
tmp[63:0] := src1[63:0] - tmp[63:0]
RETURN tmp[63:0]
}
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vreducepd
__m128d _mm_reduce_pd (__m128d a, int imm8)
Synopsis
__m128d _mm_reduce_pd (__m128d a, int imm8)
#include "immintrin.h"
Instruction: vreducepd
CPUID Flags: AVX512VL + AVX512DQ
Description
Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by the number of bits specified by imm8, and store the results in dst.
Operation
ReduceArgumentPD(src1[63:0], imm8[7:0])
{
m := imm8[7:4] // number of fraction bits after the binary point to be preserved
rc := imm8[1:0] // round control
rc_src := imm8[2] // round control source
spe := 0
tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
tmp[63:0] := src1[63:0] - tmp[63:0]
RETURN tmp[63:0]
}
FOR j := 0 to 1
i := j*64
dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:128] := 0
vreducepd
__m256d _mm256_mask_reduce_pd (__m256d src, __mmask8 k, __m256d a, int imm8)
Synopsis
__m256d _mm256_mask_reduce_pd (__m256d src, __mmask8 k, __m256d a, int imm8)
#include "immintrin.h"
Instruction: vreducepd
CPUID Flags: AVX512VL + AVX512DQ
Description
Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by the number of bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
ReduceArgumentPD(src1[63:0], imm8[7:0])
{
m := imm8[7:4] // number of fraction bits after the binary point to be preserved
rc := imm8[1:0] // round control
rc_src := imm8[2] // round control source
spe := 0
tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
tmp[63:0] := src1[63:0] - tmp[63:0]
RETURN tmp[63:0]
}
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vreducepd
__m256d _mm256_maskz_reduce_pd (__mmask8 k, __m256d a, int imm8)
Synopsis
__m256d _mm256_maskz_reduce_pd (__mmask8 k, __m256d a, int imm8)
#include "immintrin.h"
Instruction: vreducepd
CPUID Flags: AVX512VL + AVX512DQ
Description
Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by the number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
ReduceArgumentPD(src1[63:0], imm8[7:0])
{
m := imm8[7:4] // number of fraction bits after the binary point to be preserved
rc := imm8[1:0] // round control
rc_src := imm8[2] // round control source
spe := 0
tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
tmp[63:0] := src1[63:0] - tmp[63:0]
RETURN tmp[63:0]
}
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vreducepd
__m256d _mm256_reduce_pd (__m256d a, int imm8)
Synopsis
__m256d _mm256_reduce_pd (__m256d a, int imm8)
#include "immintrin.h"
Instruction: vreducepd
CPUID Flags: AVX512VL + AVX512DQ
Description
Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by the number of bits specified by imm8, and store the results in dst.
Operation
ReduceArgumentPD(src1[63:0], imm8[7:0])
{
m := imm8[7:4] // number of fraction bits after the binary point to be preserved
rc := imm8[1:0] // round control
rc_src := imm8[2] // round control source
spe := 0
tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
tmp[63:0] := src1[63:0] - tmp[63:0]
RETURN tmp[63:0]
}
FOR j := 0 to 3
i := j*64
dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:256] := 0
vreducepd
__m512d _mm512_mask_reduce_pd (__m512d src, __mmask8 k, __m512d a, int imm8)
Synopsis
__m512d _mm512_mask_reduce_pd (__m512d src, __mmask8 k, __m512d a, int imm8)
#include "immintrin.h"
Instruction: vreducepd
CPUID Flags: AVX512DQ
Description
Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by the number of bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
ReduceArgumentPD(src1[63:0], imm8[7:0])
{
m := imm8[7:4] // number of fraction bits after the binary point to be preserved
rc := imm8[1:0] // round control
rc_src := imm8[2] // round control source
spe := 0
tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
tmp[63:0] := src1[63:0] - tmp[63:0]
RETURN tmp[63:0]
}
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vreducepd
__m512d _mm512_maskz_reduce_pd (__mmask8 k, __m512d a, int imm8)
Synopsis
__m512d _mm512_maskz_reduce_pd (__mmask8 k, __m512d a, int imm8)
#include "immintrin.h"
Instruction: vreducepd
CPUID Flags: AVX512DQ
Description
Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by the number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
ReduceArgumentPD(src1[63:0], imm8[7:0])
{
m := imm8[7:4] // number of fraction bits after the binary point to be preserved
rc := imm8[1:0] // round control
rc_src := imm8[2] // round control source
spe := 0
tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
tmp[63:0] := src1[63:0] - tmp[63:0]
RETURN tmp[63:0]
}
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vreducepd
__m512d _mm512_reduce_pd (__m512d a, int imm8)
Synopsis
__m512d _mm512_reduce_pd (__m512d a, int imm8)
#include "immintrin.h"
Instruction: vreducepd
CPUID Flags: AVX512DQ
Description
Extract the reduced argument of packed double-precision (64-bit) floating-point elements in a by the number of bits specified by imm8, and store the results in dst.
Operation
ReduceArgumentPD(src1[63:0], imm8[7:0])
{
m := imm8[7:4] // number of fraction bits after the binary point to be preserved
rc := imm8[1:0] // round control
rc_src := imm8[2] // round control source
spe := 0
tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
tmp[63:0] := src1[63:0] - tmp[63:0]
RETURN tmp[63:0]
}
FOR j := 0 to 7
i := j*64
dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:512] := 0
vreduceps
__m128 _mm_mask_reduce_ps (__m128 src, __mmask8 k, __m128 a, int imm8)
Synopsis
__m128 _mm_mask_reduce_ps (__m128 src, __mmask8 k, __m128 a, int imm8)
#include "immintrin.h"
Instruction: vreduceps
CPUID Flags: AVX512VL + AVX512DQ
Description
Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by the number of bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
ReduceArgumentPS(src1[31:0], imm8[7:0])
{
IF src1[31:0] == NAN
RETURN (convert src1[31:0] to QNaN)
FI
m := imm8[7:4] // number of fraction bits after the binary point to be preserved
rc := imm8[1:0] // round control
rc_src := imm8[2] // round control source
spe := 0
tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
tmp[31:0] := src1[31:0] - tmp[31:0]
RETURN tmp[31:0]
}
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vreduceps
__m128 _mm_maskz_reduce_ps (__mmask8 k, __m128 a, int imm8)
Synopsis
__m128 _mm_maskz_reduce_ps (__mmask8 k, __m128 a, int imm8)
#include "immintrin.h"
Instruction: vreduceps
CPUID Flags: AVX512VL + AVX512DQ
Description
Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by the number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
ReduceArgumentPS(src1[31:0], imm8[7:0])
{
IF src1[31:0] == NAN
RETURN (convert src1[31:0] to QNaN)
FI
m := imm8[7:4] // number of fraction bits after the binary point to be preserved
rc := imm8[1:0] // round control
rc_src := imm8[2] // round control source
spe := 0
tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
tmp[31:0] := src1[31:0] - tmp[31:0]
RETURN tmp[31:0]
}
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vreduceps
__m128 _mm_reduce_ps (__m128 a, int imm8)
Synopsis
__m128 _mm_reduce_ps (__m128 a, int imm8)
#include "immintrin.h"
Instruction: vreduceps
CPUID Flags: AVX512VL + AVX512DQ
Description
Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by the number of bits specified by imm8, and store the results in dst.
Operation
ReduceArgumentPS(src1[31:0], imm8[7:0])
{
IF src1[31:0] == NAN
RETURN (convert src1[31:0] to QNaN)
FI
m := imm8[7:4] // number of fraction bits after the binary point to be preserved
rc := imm8[1:0] // round control
rc_src := imm8[2] // round control source
spe := 0
tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
tmp[31:0] := src1[31:0] - tmp[31:0]
RETURN tmp[31:0]
}
FOR j := 0 to 3
i := j*32
dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:128] := 0
vreduceps
__m256 _mm256_mask_reduce_ps (__m256 src, __mmask8 k, __m256 a, int imm8)
Synopsis
__m256 _mm256_mask_reduce_ps (__m256 src, __mmask8 k, __m256 a, int imm8)
#include "immintrin.h"
Instruction: vreduceps
CPUID Flags: AVX512VL + AVX512DQ
Description
Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by the number of bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
ReduceArgumentPS(src1[31:0], imm8[7:0])
{
IF src1[31:0] == NAN
RETURN (convert src1[31:0] to QNaN)
FI
m := imm8[7:4] // number of fraction bits after the binary point to be preserved
rc := imm8[1:0] // round control
rc_src := imm8[2] // round control source
spe := 0
tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
tmp[31:0] := src1[31:0] - tmp[31:0]
RETURN tmp[31:0]
}
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vreduceps
__m256 _mm256_maskz_reduce_ps (__mmask8 k, __m256 a, int imm8)
Synopsis
__m256 _mm256_maskz_reduce_ps (__mmask8 k, __m256 a, int imm8)
#include "immintrin.h"
Instruction: vreduceps
CPUID Flags: AVX512VL + AVX512DQ
Description
Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by the number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
ReduceArgumentPS(src1[31:0], imm8[7:0])
{
IF src1[31:0] == NAN
RETURN (convert src1[31:0] to QNaN)
FI
m := imm8[7:4] // number of fraction bits after the binary point to be preserved
rc := imm8[1:0] // round control
rc_src := imm8[2] // round control source
spe := 0
tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
tmp[31:0] := src1[31:0] - tmp[31:0]
RETURN tmp[31:0]
}
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vreduceps
__m256 _mm256_reduce_ps (__m256 a, int imm8)
Synopsis
__m256 _mm256_reduce_ps (__m256 a, int imm8)
#include "immintrin.h"
Instruction: vreduceps
CPUID Flags: AVX512VL + AVX512DQ
Description
Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by the number of bits specified by imm8, and store the results in dst.
Operation
ReduceArgumentPS(src1[31:0], imm8[7:0])
{
IF src1[31:0] == NAN
RETURN (convert src1[31:0] to QNaN)
FI
m := imm8[7:4] // number of fraction bits after the binary point to be preserved
rc := imm8[1:0] // round control
rc_src := imm8[2] // round control source
spe := 0
tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
tmp[31:0] := src1[31:0] - tmp[31:0]
RETURN tmp[31:0]
}
FOR j := 0 to 7
i := j*32
dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:256] := 0
vreduceps
__m512 _mm512_mask_reduce_ps (__m512 src, __mmask16 k, __m512 a, int imm8)
Synopsis
__m512 _mm512_mask_reduce_ps (__m512 src, __mmask16 k, __m512 a, int imm8)
#include "immintrin.h"
Instruction: vreduceps
CPUID Flags: AVX512DQ
Description
Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by the number of bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
ReduceArgumentPS(src1[31:0], imm8[7:0])
{
IF src1[31:0] == NAN
RETURN (convert src1[31:0] to QNaN)
FI
m := imm8[7:4] // number of fraction bits after the binary point to be preserved
rc := imm8[1:0] // round control
rc_src := imm8[2] // round control source
spe := 0
tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
tmp[31:0] := src1[31:0] - tmp[31:0]
RETURN tmp[31:0]
}
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vreduceps
__m512 _mm512_maskz_reduce_ps (__mmask16 k, __m512 a, int imm8)
Synopsis
__m512 _mm512_maskz_reduce_ps (__mmask16 k, __m512 a, int imm8)
#include "immintrin.h"
Instruction: vreduceps
CPUID Flags: AVX512DQ
Description
Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by the number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
ReduceArgumentPS(src1[31:0], imm8[7:0])
{
IF src1[31:0] == NAN
RETURN (convert src1[31:0] to QNaN)
FI
m := imm8[7:4] // number of fraction bits after the binary point to be preserved
rc := imm8[1:0] // round control
rc_src := imm8[2] // round control source
spe := 0
tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
tmp[31:0] := src1[31:0] - tmp[31:0]
RETURN tmp[31:0]
}
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vreduceps
__m512 _mm512_reduce_ps (__m512 a, int imm8)
Synopsis
__m512 _mm512_reduce_ps (__m512 a, int imm8)
#include "immintrin.h"
Instruction: vreduceps
CPUID Flags: AVX512DQ
Description
Extract the reduced argument of packed single-precision (32-bit) floating-point elements in a by the number of bits specified by imm8, and store the results in dst.
Operation
ReduceArgumentPS(src1[31:0], imm8[7:0])
{
IF src1[31:0] == NAN
RETURN (convert src1[31:0] to QNaN)
FI
m := imm8[7:4] // number of fraction bits after the binary point to be preserved
rc := imm8[1:0] // round control
rc_src := imm8[2] // round control source
spe := 0
tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
tmp[31:0] := src1[31:0] - tmp[31:0]
RETURN tmp[31:0]
}
FOR j := 0 to 15
i := j*32
dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:512] := 0
vreducepd
__m512d _mm512_mask_reduce_round_pd (__m512d src, __mmask8 k, __m512d a, int imm8, int rounding)
Synopsis
__m512d _mm512_mask_reduce_round_pd (__m512d src, __mmask8 k, __m512d a, int imm8, int rounding)
#include "immintrin.h"
Instruction: vreducepd
CPUID Flags: AVX512DQ
Description
Extract the reduced argument of packed double-precision (64-bit) floating-point elements in
a by the number of bits specified by
imm8, and store the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
ReduceArgumentPD(src1[63:0], imm8[7:0])
{
m := imm8[7:4] // number of fraction bits after the binary point to be preserved
rc := imm8[1:0] // round control
rc_src := imm8[2] // round control source
spe := 0
tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
tmp[63:0] := src1[63:0] - tmp[63:0]
RETURN tmp[63:0]
}
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vreducepd
__m512d _mm512_maskz_reduce_round_pd (__mmask8 k, __m512d a, int imm8, int rounding)
Synopsis
__m512d _mm512_maskz_reduce_round_pd (__mmask8 k, __m512d a, int imm8, int rounding)
#include "immintrin.h"
Instruction: vreducepd
CPUID Flags: AVX512DQ
Description
Extract the reduced argument of packed double-precision (64-bit) floating-point elements in
a by the number of bits specified by
imm8, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
ReduceArgumentPD(src1[63:0], imm8[7:0])
{
m := imm8[7:4] // number of fraction bits after the binary point to be preserved
rc := imm8[1:0] // round control
rc_src := imm8[2] // round control source
spe := 0
tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
tmp[63:0] := src1[63:0] - tmp[63:0]
RETURN tmp[63:0]
}
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vreducepd
__m512d _mm512_reduce_round_pd (__m512d a, int imm8, int rounding)
Synopsis
__m512d _mm512_reduce_round_pd (__m512d a, int imm8, int rounding)
#include "immintrin.h"
Instruction: vreducepd
CPUID Flags: AVX512DQ
Description
Extract the reduced argument of packed double-precision (64-bit) floating-point elements in
a by the number of bits specified by
imm8, and store the results in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
ReduceArgumentPD(src1[63:0], imm8[7:0])
{
m := imm8[7:4] // number of fraction bits after the binary point to be preserved
rc := imm8[1:0] // round control
rc_src := imm8[2] // round control source
spe := 0
tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
tmp[63:0] := src1[63:0] - tmp[63:0]
RETURN tmp[63:0]
}
FOR j := 0 to 7
i := j*64
dst[i+63:i] := ReduceArgumentPD(a[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:512] := 0
vreduceps
__m512 _mm512_mask_reduce_round_ps (__m512 src, __mmask16 k, __m512 a, int imm8, int rounding)
Synopsis
__m512 _mm512_mask_reduce_round_ps (__m512 src, __mmask16 k, __m512 a, int imm8, int rounding)
#include "immintrin.h"
Instruction: vreduceps
CPUID Flags: AVX512DQ
Description
Extract the reduced argument of packed single-precision (32-bit) floating-point elements in
a by the number of bits specified by
imm8, and store the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
ReduceArgumentPS(src1[31:0], imm8[7:0])
{
IF src1[31:0] == NAN
RETURN (convert src1[31:0] to QNaN)
FI
m := imm8[7:4] // number of fraction bits after the binary point to be preserved
rc := imm8[1:0] // round control
rc_src := imm8[2] // round control source
spe := 0
tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
tmp[31:0] := src1[31:0] - tmp[31:0]
RETURN tmp[31:0]
}
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vreduceps
__m512 _mm512_maskz_reduce_round_ps (__mmask16 k, __m512 a, int imm8, int rounding)
Synopsis
__m512 _mm512_maskz_reduce_round_ps (__mmask16 k, __m512 a, int imm8, int rounding)
#include "immintrin.h"
Instruction: vreduceps
CPUID Flags: AVX512DQ
Description
Extract the reduced argument of packed single-precision (32-bit) floating-point elements in
a by the number of bits specified by
imm8, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
ReduceArgumentPS(src1[31:0], imm8[7:0])
{
IF src1[31:0] == NAN
RETURN (convert src1[31:0] to QNaN)
FI
m := imm8[7:4] // number of fraction bits after the binary point to be preserved
rc := imm8[1:0] // round control
rc_src := imm8[2] // round control source
spe := 0
tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
tmp[31:0] := src1[31:0] - tmp[31:0]
RETURN tmp[31:0]
}
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vreduceps
__m512 _mm512_reduce_round_ps (__m512 a, int imm8, int rounding)
Synopsis
__m512 _mm512_reduce_round_ps (__m512 a, int imm8, int rounding)
#include "immintrin.h"
Instruction: vreduceps
CPUID Flags: AVX512DQ
Description
Extract the reduced argument of packed single-precision (32-bit) floating-point elements in
a by the number of bits specified by
imm8, and store the results in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
ReduceArgumentPS(src1[31:0], imm8[7:0])
{
IF src1[31:0] == NAN
RETURN (convert src1[31:0] to QNaN)
FI
m := imm8[7:4] // number of fraction bits after the binary point to be preserved
rc := imm8[1:0] // round control
rc_src := imm8[2] // round control source
spe := 0
tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
tmp[31:0] := src1[31:0] - tmp[31:0]
RETURN tmp[31:0]
}
FOR j := 0 to 15
i := j*32
dst[i+31:i] := ReduceArgumentPS(a[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:512] := 0
vreducesd
__m128d _mm_mask_reduce_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int imm8, int rounding)
Synopsis
__m128d _mm_mask_reduce_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int imm8, int rounding)
#include "immintrin.h"
Instruction: vreducesd
CPUID Flags: AVX512DQ
Description
Extract the reduced argument of the lower double-precision (64-bit) floating-point element in
a by the number of bits specified by
imm8, store the result in the lower element of
dst using writemask
k (the element is copied from
src when mask bit 0 is not set), and copy the upper element from
b to the upper element of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
ReduceArgumentPD(src1[63:0], imm8[7:0])
{
m := imm8[7:4] // number of fraction bits after the binary point to be preserved
rc := imm8[1:0] // round control
rc_src := imm8[2] // round control source
spe := 0
tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
tmp[63:0] := src1[63:0] - tmp[63:0]
RETURN tmp[63:0]
}
IF k[0]
dst[63:0] := ReduceArgumentPD(a[63:0], imm8[7:0])
ELSE
dst[63:0] := src[63:0]
FI
dst[127:64] := b[127:64]
dst[MAX:128] := 0
vreducesd
__m128d _mm_maskz_reduce_round_sd (__mmask8 k, __m128d a, __m128d b, int imm8, int rounding)
Synopsis
__m128d _mm_maskz_reduce_round_sd (__mmask8 k, __m128d a, __m128d b, int imm8, int rounding)
#include "immintrin.h"
Instruction: vreducesd
CPUID Flags: AVX512DQ
Description
Extract the reduced argument of the lower double-precision (64-bit) floating-point element in
a by the number of bits specified by
imm8, store the result in the lower element of
dst using zeromask
k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from
b to the upper element of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
ReduceArgumentPD(src1[63:0], imm8[7:0])
{
m := imm8[7:4] // number of fraction bits after the binary point to be preserved
rc := imm8[1:0] // round control
rc_src := imm8[2] // round control source
spe := 0
tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
tmp[63:0] := src1[63:0] - tmp[63:0]
RETURN tmp[63:0]
}
IF k[0]
dst[63:0] := ReduceArgumentPD(a[63:0], imm8[7:0])
ELSE
dst[63:0] := 0
FI
dst[127:64] := b[127:64]
dst[MAX:128] := 0
vreducesd
__m128d _mm_reduce_round_sd (__m128d a, __m128d b, int imm8, int rounding)
Synopsis
__m128d _mm_reduce_round_sd (__m128d a, __m128d b, int imm8, int rounding)
#include "immintrin.h"
Instruction: vreducesd
CPUID Flags: AVX512DQ
Description
Extract the reduced argument of the lower double-precision (64-bit) floating-point element in
a by the number of bits specified by
imm8, store the result in the lower element of
dst, and copy the upper element from
b to the upper element of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
ReduceArgumentPD(src1[63:0], imm8[7:0])
{
m := imm8[7:4] // number of fraction bits after the binary point to be preserved
rc := imm8[1:0] // round control
rc_src := imm8[2] // round control source
spe := 0
tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
tmp[63:0] := src1[63:0] - tmp[63:0]
RETURN tmp[63:0]
}
dst[63:0] := ReduceArgumentPD(a[63:0], imm8[7:0])
dst[127:64] := b[127:64]
dst[MAX:128] := 0
vreducess
__m128 _mm_mask_reduce_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int imm8, int rounding)
Synopsis
__m128 _mm_mask_reduce_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int imm8, int rounding)
#include "immintrin.h"
Instruction: vreducess
CPUID Flags: AVX512DQ
Description
Extract the reduced argument of the lower single-precision (32-bit) floating-point element in
a by the number of bits specified by
imm8, store the result in the lower element of
dst using writemask
k (the element is copied from
src when mask bit 0 is not set), and copy the upper 3 packed elements from
b to the upper elements of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
ReduceArgumentPS(src1[31:0], imm8[7:0])
{
IF src1[31:0] == NAN
RETURN (convert src1[31:0] to QNaN)
FI
m := imm8[7:4] // number of fraction bits after the binary point to be preserved
rc := imm8[1:0] // round control
rc_src := imm8[2] // round control source
spe := 0
tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
tmp[31:0] := src1[31:0] - tmp[31:0]
RETURN tmp[31:0]
}
IF k[0]
dst[31:0] := ReduceArgumentPS(a[31:0], imm8[7:0])
ELSE
dst[31:0] := src[31:0]
FI
dst[127:32] := b[127:32]
dst[MAX:128] := 0
vreducess
__m128 _mm_maskz_reduce_round_ss (__mmask8 k, __m128 a, __m128 b, int imm8, int rounding)
Synopsis
__m128 _mm_maskz_reduce_round_ss (__mmask8 k, __m128 a, __m128 b, int imm8, int rounding)
#include "immintrin.h"
Instruction: vreducess
CPUID Flags: AVX512DQ
Description
Extract the reduced argument of the lower single-precision (32-bit) floating-point element in
a by the number of bits specified by
imm8, store the result in the lower element of
dst using zeromask
k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from
b to the upper elements of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
ReduceArgumentPS(src1[31:0], imm8[7:0])
{
IF src1[31:0] == NAN
RETURN (convert src1[31:0] to QNaN)
FI
m := imm8[7:4] // number of fraction bits after the binary point to be preserved
rc := imm8[1:0] // round control
rc_src := imm8[2] // round control source
spe := 0
tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
tmp[31:0] := src1[31:0] - tmp[31:0]
RETURN tmp[31:0]
}
IF k[0]
dst[31:0] := ReduceArgumentPS(a[31:0], imm8[7:0])
ELSE
dst[31:0] := 0
FI
dst[127:32] := b[127:32]
dst[MAX:128] := 0
vreducess
__m128 _mm_reduce_round_ss (__m128 a, __m128 b, int imm8, int rounding)
Synopsis
__m128 _mm_reduce_round_ss (__m128 a, __m128 b, int imm8, int rounding)
#include "immintrin.h"
Instruction: vreducess
CPUID Flags: AVX512DQ
Description
Extract the reduced argument of the lower single-precision (32-bit) floating-point element in
a by the number of bits specified by
imm8, store the result in the lower element of
dst, and copy the upper 3 packed elements from
b to the upper elements of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
ReduceArgumentPS(src1[31:0], imm8[7:0])
{
IF src1[31:0] == NAN
RETURN (convert src1[31:0] to QNaN)
FI
m := imm8[7:4] // number of fraction bits after the binary point to be preserved
rc := imm8[1:0] // round control
rc_src := imm8[2] // round control source
spe := 0
tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
tmp[31:0] := src1[31:0] - tmp[31:0]
RETURN tmp[31:0]
}
dst[31:0] := ReduceArgumentPS(a[31:0], imm8[7:0])
dst[127:32] := b[127:32]
dst[MAX:128] := 0
vreducesd
__m128d _mm_mask_reduce_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int imm8)
Synopsis
__m128d _mm_mask_reduce_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int imm8)
#include "immintrin.h"
Instruction: vreducesd
CPUID Flags: AVX512DQ
Description
Extract the reduced argument of the lower double-precision (64-bit) floating-point element in a by the number of bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from b to the upper element of dst.
Operation
ReduceArgumentPD(src1[63:0], imm8[7:0])
{
m := imm8[7:4] // number of fraction bits after the binary point to be preserved
rc := imm8[1:0] // round control
rc_src := imm8[2] // round control source
spe := 0
tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
tmp[63:0] := src1[63:0] - tmp[63:0]
RETURN tmp[63:0]
}
IF k[0]
dst[63:0] := ReduceArgumentPD(a[63:0], imm8[7:0])
ELSE
dst[63:0] := src[63:0]
FI
dst[127:64] := b[127:64]
dst[MAX:128] := 0
vreducesd
__m128d _mm_maskz_reduce_sd (__mmask8 k, __m128d a, __m128d b, int imm8)
Synopsis
__m128d _mm_maskz_reduce_sd (__mmask8 k, __m128d a, __m128d b, int imm8)
#include "immintrin.h"
Instruction: vreducesd
CPUID Flags: AVX512DQ
Description
Extract the reduced argument of the lower double-precision (64-bit) floating-point element in a by the number of bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from b to the upper element of dst.
Operation
ReduceArgumentPD(src1[63:0], imm8[7:0])
{
m := imm8[7:4] // number of fraction bits after the binary point to be preserved
rc := imm8[1:0] // round control
rc_src := imm8[2] // round control source
spe := 0
tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
tmp[63:0] := src1[63:0] - tmp[63:0]
RETURN tmp[63:0]
}
IF k[0]
dst[63:0] := ReduceArgumentPD(a[63:0], imm8[7:0])
ELSE
dst[63:0] := 0
FI
dst[127:64] := b[127:64]
dst[MAX:128] := 0
vreducesd
__m128d _mm_reduce_sd (__m128d a, __m128d b, int imm8)
Synopsis
__m128d _mm_reduce_sd (__m128d a, __m128d b, int imm8)
#include "immintrin.h"
Instruction: vreducesd
CPUID Flags: AVX512DQ
Description
Extract the reduced argument of the lower double-precision (64-bit) floating-point element in a by the number of bits specified by imm8, store the result in the lower element of dst, and copy the upper element from b to the upper element of dst.
Operation
ReduceArgumentPD(src1[63:0], imm8[7:0])
{
m := imm8[7:4] // number of fraction bits after the binary point to be preserved
rc := imm8[1:0] // round control
rc_src := imm8[2] // round control source
spe := 0
tmp[63:0] := pow(2, -m) * ROUND(pow(2, m) * src1[63:0], spe, rc_src, rc)
tmp[63:0] := src1[63:0] - tmp[63:0]
RETURN tmp[63:0]
}
dst[63:0] := ReduceArgumentPD(a[63:0], imm8[7:0])
dst[127:64] := b[127:64]
dst[MAX:128] := 0
vreducess
__m128 _mm_mask_reduce_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int imm8)
Synopsis
__m128 _mm_mask_reduce_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int imm8)
#include "immintrin.h"
Instruction: vreducess
CPUID Flags: AVX512DQ
Description
Extract the reduced argument of the lower single-precision (32-bit) floating-point element in a by the number of bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from b to the upper elements of dst.
Operation
ReduceArgumentPS(src1[31:0], imm8[7:0])
{
IF src1[31:0] == NAN
RETURN (convert src1[31:0] to QNaN)
FI
m := imm8[7:4] // number of fraction bits after the binary point to be preserved
rc := imm8[1:0] // round control
rc_src := imm8[2] // round control source
spe := 0
tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
tmp[31:0] := src1[31:0] - tmp[31:0]
RETURN tmp[31:0]
}
IF k[0]
dst[31:0] := ReduceArgumentPS(a[31:0], imm8[7:0])
ELSE
dst[31:0] := src[31:0]
FI
dst[127:32] := b[127:32]
dst[MAX:128] := 0
vreducess
__m128 _mm_maskz_reduce_ss (__mmask8 k, __m128 a, __m128 b, int imm8)
Synopsis
__m128 _mm_maskz_reduce_ss (__mmask8 k, __m128 a, __m128 b, int imm8)
#include "immintrin.h"
Instruction: vreducess
CPUID Flags: AVX512DQ
Description
Extract the reduced argument of the lower single-precision (32-bit) floating-point element in a by the number of bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from b to the upper elements of dst.
Operation
ReduceArgumentPS(src1[31:0], imm8[7:0])
{
IF src1[31:0] == NAN
RETURN (convert src1[31:0] to QNaN)
FI
m := imm8[7:4] // number of fraction bits after the binary point to be preserved
rc := imm8[1:0] // round control
rc_src := imm8[2] // round control source
spe := 0
tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
tmp[31:0] := src1[31:0] - tmp[31:0]
RETURN tmp[31:0]
}
IF k[0]
dst[31:0] := ReduceArgumentPS(a[31:0], imm8[7:0])
ELSE
dst[31:0] := 0
FI
dst[127:32] := b[127:32]
dst[MAX:128] := 0
vreducess
__m128 _mm_reduce_ss (__m128 a, __m128 b, int imm8)
Synopsis
__m128 _mm_reduce_ss (__m128 a, __m128 b, int imm8)
#include "immintrin.h"
Instruction: vreducess
CPUID Flags: AVX512DQ
Description
Extract the reduced argument of the lower single-precision (32-bit) floating-point element in a by the number of bits specified by imm8, store the result in the lower element of dst, and copy the upper 3 packed elements from b to the upper elements of dst.
Operation
ReduceArgumentPS(src1[31:0], imm8[7:0])
{
IF src1[31:0] == NAN
RETURN (convert src1[31:0] to QNaN)
FI
m := imm8[7:4] // number of fraction bits after the binary point to be preserved
rc := imm8[1:0] // round control
rc_src := imm8[2] // round control source
spe := 0
tmp[31:0] := pow(2, -m)*ROUND(pow(2, m)*src1[31:0], spe, rc_src, rc)
tmp[31:0] := src1[31:0] - tmp[31:0]
RETURN tmp[31:0]
}
dst[31:0] := ReduceArgumentPS(a[31:0], imm8[7:0])
dst[127:32] := b[127:32]
dst[MAX:128] := 0
...
__m128i _mm_rem_epi16 (__m128i a, __m128i b)
Synopsis
__m128i _mm_rem_epi16 (__m128i a, __m128i b)
#include "immintrin.h"
CPUID Flags: SSE
Description
Divide packed 16-bit integers in a by packed elements in b, and store the remainders as packed 16-bit integers in dst.
Operation
FOR j := 0 to 7
i := 16*j
dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i])
ENDFOR
dst[MAX:128] := 0
...
__m256i _mm256_rem_epi16 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_rem_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
CPUID Flags: AVX
Description
Divide packed 16-bit integers in a by packed elements in b, and store the remainders as packed 16-bit integers in dst.
Operation
FOR j := 0 to 15
i := 16*j
dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i])
ENDFOR
dst[MAX:256] := 0
...
__m512i _mm512_rem_epi16 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_rem_epi16 (__m512i a, __m512i b)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Divide packed 16-bit integers in a by packed elements in b, and store the remainders as packed 16-bit integers in dst.
Operation
FOR j := 0 to 31
i := 16*j
dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i])
ENDFOR
dst[MAX:512] := 0
...
__m128i _mm_rem_epi32 (__m128i a, __m128i b)
Synopsis
__m128i _mm_rem_epi32 (__m128i a, __m128i b)
#include "immintrin.h"
CPUID Flags: SSE
Description
Divide packed 32-bit integers in a by packed elements in b, and store the remainders as packed 32-bit integers in dst.
Operation
FOR j := 0 to 3
i := 32*j
dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256i _mm256_rem_epi32 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_rem_epi32 (__m256i a, __m256i b)
#include "immintrin.h"
CPUID Flags: AVX
Description
Divide packed 32-bit integers in a by packed elements in b, and store the remainders as packed 32-bit integers in dst.
Operation
FOR j := 0 to 7
i := 32*j
dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:256] := 0
...
__m512i _mm512_mask_rem_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_rem_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Divide packed 32-bit integers in a by packed elements in b, and store the remainders as packed 32-bit integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := 32*j
IF k[j]
dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m512i _mm512_rem_epi32 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_rem_epi32 (__m512i a, __m512i b)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Divide packed 32-bit integers in a by packed elements in b, and store the remainders as packed 32-bit integers in dst.
Operation
FOR j := 0 to 15
i := 32*j
dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:512] := 0
...
__m128i _mm_rem_epi64 (__m128i a, __m128i b)
Synopsis
__m128i _mm_rem_epi64 (__m128i a, __m128i b)
#include "immintrin.h"
CPUID Flags: SSE
Description
Divide packed 64-bit integers in a by packed elements in b, and store the remainders as packed 64-bit integers in dst.
Operation
FOR j := 0 to 1
i := 64*j
dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i])
ENDFOR
dst[MAX:128] := 0
...
__m256i _mm256_rem_epi64 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_rem_epi64 (__m256i a, __m256i b)
#include "immintrin.h"
CPUID Flags: AVX
Description
Divide packed 64-bit integers in a by packed elements in b, and store the remainders as packed 64-bit integers in dst.
Operation
FOR j := 0 to 3
i := 64*j
dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i])
ENDFOR
dst[MAX:256] := 0
...
__m512i _mm512_rem_epi64 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_rem_epi64 (__m512i a, __m512i b)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Divide packed 64-bit integers in a by packed elements in b, and store the remainders as packed 64-bit integers in dst.
Operation
FOR j := 0 to 7
i := 64*j
dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i])
ENDFOR
dst[MAX:512] := 0
...
__m128i _mm_rem_epi8 (__m128i a, __m128i b)
Synopsis
__m128i _mm_rem_epi8 (__m128i a, __m128i b)
#include "immintrin.h"
CPUID Flags: SSE
Description
Divide packed 8-bit integers in a by packed elements in b, and store the remainders as packed 8-bit integers in dst.
Operation
FOR j := 0 to 15
i := 8*j
dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i])
ENDFOR
dst[MAX:128] := 0
...
__m256i _mm256_rem_epi8 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_rem_epi8 (__m256i a, __m256i b)
#include "immintrin.h"
CPUID Flags: AVX
Description
Divide packed 8-bit integers in a by packed elements in b, and store the remainders as packed 8-bit integers in dst.
Operation
FOR j := 0 to 31
i := 8*j
dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i])
ENDFOR
dst[MAX:256] := 0
...
__m512i _mm512_rem_epi8 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_rem_epi8 (__m512i a, __m512i b)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Divide packed 8-bit integers in a by packed elements in b, and store the remainders as packed 8-bit integers in dst.
Operation
FOR j := 0 to 63
i := 8*j
dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i])
ENDFOR
dst[MAX:512] := 0
...
__m128i _mm_rem_epu16 (__m128i a, __m128i b)
Synopsis
__m128i _mm_rem_epu16 (__m128i a, __m128i b)
#include "immintrin.h"
CPUID Flags: SSE
Description
Divide packed unsigned 16-bit integers in a by packed elements in b, and store the remainders as packed unsigned 16-bit integers in dst.
Operation
FOR j := 0 to 7
i := 16*j
dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i])
ENDFOR
dst[MAX:128] := 0
...
__m256i _mm256_rem_epu16 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_rem_epu16 (__m256i a, __m256i b)
#include "immintrin.h"
CPUID Flags: AVX
Description
Divide packed unsigned 16-bit integers in a by packed elements in b, and store the remainders as packed unsigned 16-bit integers in dst.
Operation
FOR j := 0 to 15
i := 16*j
dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i])
ENDFOR
dst[MAX:256] := 0
...
__m512i _mm512_rem_epu16 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_rem_epu16 (__m512i a, __m512i b)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Divide packed unsigned 16-bit integers in a by packed elements in b, and store the remainders as packed unsigned 16-bit integers in dst.
Operation
FOR j := 0 to 31
i := 16*j
dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i])
ENDFOR
dst[MAX:512] := 0
...
__m128i _mm_rem_epu32 (__m128i a, __m128i b)
Synopsis
__m128i _mm_rem_epu32 (__m128i a, __m128i b)
#include "immintrin.h"
CPUID Flags: SSE
Description
Divide packed unsigned 32-bit integers in a by packed elements in b, and store the remainders as packed unsigned 32-bit integers in dst.
Operation
FOR j := 0 to 3
i := 32*j
dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256i _mm256_rem_epu32 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_rem_epu32 (__m256i a, __m256i b)
#include "immintrin.h"
CPUID Flags: AVX
Description
Divide packed unsigned 32-bit integers in a by packed elements in b, and store the remainders as packed unsigned 32-bit integers in dst.
Operation
FOR j := 0 to 7
i := 32*j
dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:256] := 0
...
__m512i _mm512_mask_rem_epu32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_rem_epu32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Divide packed unsigned 32-bit integers in a by packed elements in b, and store the remainders as packed unsigned 32-bit integers in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := 32*j
IF k[j]
dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m512i _mm512_rem_epu32 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_rem_epu32 (__m512i a, __m512i b)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Divide packed unsigned 32-bit integers in a by packed elements in b, and store the remainders as packed unsigned 32-bit integers in dst.
Operation
FOR j := 0 to 15
i := 32*j
dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:512] := 0
...
__m128i _mm_rem_epu64 (__m128i a, __m128i b)
Synopsis
__m128i _mm_rem_epu64 (__m128i a, __m128i b)
#include "immintrin.h"
CPUID Flags: SSE
Description
Divide packed unsigned 64-bit integers in a by packed elements in b, and store the remainders as packed unsigned 64-bit integers in dst.
Operation
FOR j := 0 to 1
i := 64*j
dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i])
ENDFOR
dst[MAX:128] := 0
...
__m256i _mm256_rem_epu64 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_rem_epu64 (__m256i a, __m256i b)
#include "immintrin.h"
CPUID Flags: AVX
Description
Divide packed unsigned 64-bit integers in a by packed elements in b, and store the remainders as packed unsigned 64-bit integers in dst.
Operation
FOR j := 0 to 3
i := 64*j
dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i])
ENDFOR
dst[MAX:256] := 0
...
__m512i _mm512_rem_epu64 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_rem_epu64 (__m512i a, __m512i b)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Divide packed unsigned 64-bit integers in a by packed elements in b, and store the remainders as packed unsigned 64-bit integers in dst.
Operation
FOR j := 0 to 7
i := 64*j
dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i])
ENDFOR
dst[MAX:512] := 0
...
__m128i _mm_rem_epu8 (__m128i a, __m128i b)
Synopsis
__m128i _mm_rem_epu8 (__m128i a, __m128i b)
#include "immintrin.h"
CPUID Flags: SSE
Description
Divide packed unsigned 8-bit integers in a by packed elements in b, and store the remainders as packed unsigned 8-bit integers in dst.
Operation
FOR j := 0 to 15
i := 8*j
dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i])
ENDFOR
dst[MAX:128] := 0
...
__m256i _mm256_rem_epu8 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_rem_epu8 (__m256i a, __m256i b)
#include "immintrin.h"
CPUID Flags: AVX
Description
Divide packed unsigned 8-bit integers in a by packed elements in b, and store the remainders as packed unsigned 8-bit integers in dst.
Operation
FOR j := 0 to 31
i := 8*j
dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i])
ENDFOR
dst[MAX:256] := 0
...
__m512i _mm512_rem_epu8 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_rem_epu8 (__m512i a, __m512i b)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Divide packed unsigned 8-bit integers in a by packed elements in b, and store the remainders as packed unsigned 8-bit integers in dst.
Operation
FOR j := 0 to 63
i := 8*j
dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i])
ENDFOR
dst[MAX:512] := 0
...
__m512d _mm512_mask_rint_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_rint_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Rounds the packed double-precision (64-bit) floating-point elements in a to the nearest even integer value and stores the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := RoundToNearestEven(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m512d _mm512_rint_pd (__m512d a)
Synopsis
__m512d _mm512_rint_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Rounds the packed double-precision (64-bit) floating-point elements in a to the nearest even integer value and stores the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := RoundToNearestEven(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
...
__m512 _mm512_mask_rint_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_rint_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Rounds the packed single-precision (32-bit) floating-point elements in a to the nearest even integer value and stores the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := RoundToNearestEven(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m512 _mm512_rint_ps (__m512 a)
Synopsis
__m512 _mm512_rint_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Rounds the packed single-precision (32-bit) floating-point elements in a to the nearest even integer value and stores the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := RoundToNearestEven(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
vprold
__m128i _mm_mask_rol_epi32 (__m128i src, __mmask8 k, __m128i a, const int imm8)
Synopsis
__m128i _mm_mask_rol_epi32 (__m128i src, __mmask8 k, __m128i a, const int imm8)
#include "immintrin.h"
Instruction: vprold
CPUID Flags: AVX512VL + AVX512F
Description
Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
LEFT_ROTATE_DWORDS(src, count_src){
count := count_src modulo 32
RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vprold
__m128i _mm_maskz_rol_epi32 (__mmask8 k, __m128i a, const int imm8)
Synopsis
__m128i _mm_maskz_rol_epi32 (__mmask8 k, __m128i a, const int imm8)
#include "immintrin.h"
Instruction: vprold
CPUID Flags: AVX512VL + AVX512F
Description
Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
LEFT_ROTATE_DWORDS(src, count_src){
count := count_src modulo 32
RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vprold
__m128i _mm_rol_epi32 (__m128i a, int imm8)
Synopsis
__m128i _mm_rol_epi32 (__m128i a, int imm8)
#include "immintrin.h"
Instruction: vprold
CPUID Flags: AVX512VL + AVX512F
Description
Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst.
Operation
LEFT_ROTATE_DWORDS(src, count_src){
count := count_src modulo 32
RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 3
i := j*32
dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:128] := 0
vprold
__m256i _mm256_mask_rol_epi32 (__m256i src, __mmask8 k, __m256i a, const int imm8)
Synopsis
__m256i _mm256_mask_rol_epi32 (__m256i src, __mmask8 k, __m256i a, const int imm8)
#include "immintrin.h"
Instruction: vprold
CPUID Flags: AVX512VL + AVX512F
Description
Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
LEFT_ROTATE_DWORDS(src, count_src){
count := count_src modulo 32
RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vprold
__m256i _mm256_maskz_rol_epi32 (__mmask8 k, __m256i a, const int imm8)
Synopsis
__m256i _mm256_maskz_rol_epi32 (__mmask8 k, __m256i a, const int imm8)
#include "immintrin.h"
Instruction: vprold
CPUID Flags: AVX512VL + AVX512F
Description
Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
LEFT_ROTATE_DWORDS(src, count_src){
count := count_src modulo 32
RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vprold
__m256i _mm256_rol_epi32 (__m256i a, const int imm8)
Synopsis
__m256i _mm256_rol_epi32 (__m256i a, const int imm8)
#include "immintrin.h"
Instruction: vprold
CPUID Flags: AVX512VL + AVX512F
Description
Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst.
Operation
LEFT_ROTATE_DWORDS(src, count_src){
count := count_src modulo 32
RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 7
i := j*32
dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:256] := 0
vprold
__m512i _mm512_mask_rol_epi32 (__m512i src, __mmask16 k, __m512i a, const int imm8)
Synopsis
__m512i _mm512_mask_rol_epi32 (__m512i src, __mmask16 k, __m512i a, const int imm8)
#include "immintrin.h"
Instruction: vprold zmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
LEFT_ROTATE_DWORDS(src, count_src){
count := count_src modulo 32
RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vprold
__m512i _mm512_maskz_rol_epi32 (__mmask16 k, __m512i a, const int imm8)
Synopsis
__m512i _mm512_maskz_rol_epi32 (__mmask16 k, __m512i a, const int imm8)
#include "immintrin.h"
Instruction: vprold zmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
LEFT_ROTATE_DWORDS(src, count_src){
count := count_src modulo 32
RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vprold
__m512i _mm512_rol_epi32 (__m512i a, const int imm8)
Synopsis
__m512i _mm512_rol_epi32 (__m512i a, const int imm8)
#include "immintrin.h"
Instruction: vprold zmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst.
Operation
LEFT_ROTATE_DWORDS(src, count_src){
count := count_src modulo 32
RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 15
i := j*32
dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:512] := 0
vprolq
__m128i _mm_mask_rol_epi64 (__m128i src, __mmask8 k, __m128i a, const int imm8)
Synopsis
__m128i _mm_mask_rol_epi64 (__m128i src, __mmask8 k, __m128i a, const int imm8)
#include "immintrin.h"
Instruction: vprolq
CPUID Flags: AVX512VL + AVX512F
Description
Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
LEFT_ROTATE_QWORDS(src, count_src){
count := count_src modulo 64
RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vprolq
__m128i _mm_maskz_rol_epi64 (__mmask8 k, __m128i a, const int imm8)
Synopsis
__m128i _mm_maskz_rol_epi64 (__mmask8 k, __m128i a, const int imm8)
#include "immintrin.h"
Instruction: vprolq
CPUID Flags: AVX512VL + AVX512F
Description
Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
LEFT_ROTATE_QWORDS(src, count_src){
count := count_src modulo 64
RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vprolq
__m128i _mm_rol_epi64 (__m128i a, const int imm8)
Synopsis
__m128i _mm_rol_epi64 (__m128i a, const int imm8)
#include "immintrin.h"
Instruction: vprolq
CPUID Flags: AVX512VL + AVX512F
Description
Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst.
Operation
LEFT_ROTATE_QWORDS(src, count_src){
count := count_src modulo 64
RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 1
i := j*64
dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:128] := 0
vprolq
__m256i _mm256_mask_rol_epi64 (__m256i src, __mmask8 k, __m256i a, const int imm8)
Synopsis
__m256i _mm256_mask_rol_epi64 (__m256i src, __mmask8 k, __m256i a, const int imm8)
#include "immintrin.h"
Instruction: vprolq
CPUID Flags: AVX512VL + AVX512F
Description
Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
LEFT_ROTATE_QWORDS(src, count_src){
count := count_src modulo 64
RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vprolq
__m256i _mm256_maskz_rol_epi64 (__mmask8 k, __m256i a, const int imm8)
Synopsis
__m256i _mm256_maskz_rol_epi64 (__mmask8 k, __m256i a, const int imm8)
#include "immintrin.h"
Instruction: vprolq
CPUID Flags: AVX512VL + AVX512F
Description
Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
LEFT_ROTATE_QWORDS(src, count_src){
count := count_src modulo 64
RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vprolq
__m256i _mm256_rol_epi64 (__m256i a, const int imm8)
Synopsis
__m256i _mm256_rol_epi64 (__m256i a, const int imm8)
#include "immintrin.h"
Instruction: vprolq
CPUID Flags: AVX512VL + AVX512F
Description
Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst.
Operation
LEFT_ROTATE_QWORDS(src, count_src){
count := count_src modulo 64
RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 3
i := j*64
dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:256] := 0
vprolq
__m512i _mm512_mask_rol_epi64 (__m512i src, __mmask8 k, __m512i a, const int imm8)
Synopsis
__m512i _mm512_mask_rol_epi64 (__m512i src, __mmask8 k, __m512i a, const int imm8)
#include "immintrin.h"
Instruction: vprolq zmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
LEFT_ROTATE_QWORDS(src, count_src){
count := count_src modulo 64
RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vprolq
__m512i _mm512_maskz_rol_epi64 (__mmask8 k, __m512i a, const int imm8)
Synopsis
__m512i _mm512_maskz_rol_epi64 (__mmask8 k, __m512i a, const int imm8)
#include "immintrin.h"
Instruction: vprolq zmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
LEFT_ROTATE_QWORDS(src, count_src){
count := count_src modulo 64
RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vprolq
__m512i _mm512_rol_epi64 (__m512i a, const int imm8)
Synopsis
__m512i _mm512_rol_epi64 (__m512i a, const int imm8)
#include "immintrin.h"
Instruction: vprolq zmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in imm8, and store the results in dst.
Operation
LEFT_ROTATE_QWORDS(src, count_src){
count := count_src modulo 64
RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 7
i := j*64
dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:512] := 0
vprolvd
__m128i _mm_mask_rolv_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_rolv_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vprolvd
CPUID Flags: AVX512VL + AVX512F
Description
Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
LEFT_ROTATE_DWORDS(src, count_src){
count := count_src modulo 32
RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vprolvd
__m128i _mm_maskz_rolv_epi32 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_rolv_epi32 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vprolvd
CPUID Flags: AVX512VL + AVX512F
Description
Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
LEFT_ROTATE_DWORDS(src, count_src){
count := count_src modulo 32
RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vprolvd
__m128i _mm_rolv_epi32 (__m128i a, __m128i b)
Synopsis
__m128i _mm_rolv_epi32 (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vprolvd
CPUID Flags: AVX512VL + AVX512F
Description
Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst.
Operation
LEFT_ROTATE_DWORDS(src, count_src){
count := count_src modulo 32
RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 3
i := j*32
dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
ENDFOR
dst[MAX:128] := 0
vprolvd
__m256i _mm256_mask_rolv_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_rolv_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vprolvd
CPUID Flags: AVX512VL + AVX512F
Description
Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
LEFT_ROTATE_DWORDS(src, count_src){
count := count_src modulo 32
RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vprolvd
__m256i _mm256_maskz_rolv_epi32 (__mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_rolv_epi32 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vprolvd
CPUID Flags: AVX512VL + AVX512F
Description
Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
LEFT_ROTATE_DWORDS(src, count_src){
count := count_src modulo 32
RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vprolvd
__m256i _mm256_rolv_epi32 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_rolv_epi32 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vprolvd
CPUID Flags: AVX512VL + AVX512F
Description
Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst.
Operation
LEFT_ROTATE_DWORDS(src, count_src){
count := count_src modulo 32
RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 7
i := j*32
dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
ENDFOR
dst[MAX:256] := 0
vprolvd
__m512i _mm512_mask_rolv_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_rolv_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vprolvd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
LEFT_ROTATE_DWORDS(src, count_src){
count := count_src modulo 32
RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vprolvd
__m512i _mm512_maskz_rolv_epi32 (__mmask16 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_rolv_epi32 (__mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vprolvd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
LEFT_ROTATE_DWORDS(src, count_src){
count := count_src modulo 32
RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vprolvd
__m512i _mm512_rolv_epi32 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_rolv_epi32 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vprolvd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Rotate the bits in each packed 32-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst.
Operation
LEFT_ROTATE_DWORDS(src, count_src){
count := count_src modulo 32
RETURN (src << count) OR (src >> (32 - count))
}
FOR j := 0 to 15
i := j*32
dst[i+31:i] := LEFT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
ENDFOR
dst[MAX:512] := 0
vprolvq
__m128i _mm_mask_rolv_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_rolv_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vprolvq
CPUID Flags: AVX512VL + AVX512F
Description
Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
LEFT_ROTATE_QWORDS(src, count_src){
count := count_src modulo 64
RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vprolvq
__m128i _mm_maskz_rolv_epi64 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_rolv_epi64 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vprolvq
CPUID Flags: AVX512VL + AVX512F
Description
Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
LEFT_ROTATE_QWORDS(src, count_src){
count := count_src modulo 64
RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vprolvq
__m128i _mm_rolv_epi64 (__m128i a, __m128i b)
Synopsis
__m128i _mm_rolv_epi64 (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vprolvq
CPUID Flags: AVX512VL + AVX512F
Description
Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst.
Operation
LEFT_ROTATE_QWORDS(src, count_src){
count := count_src modulo 64
RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 1
i := j*64
dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:128] := 0
vprolvq
__m256i _mm256_mask_rolv_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_rolv_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vprolvq
CPUID Flags: AVX512VL + AVX512F
Description
Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
LEFT_ROTATE_QWORDS(src, count_src){
count := count_src modulo 64
RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vprolvq
__m256i _mm256_maskz_rolv_epi64 (__mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_rolv_epi64 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vprolvq
CPUID Flags: AVX512VL + AVX512F
Description
Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
LEFT_ROTATE_QWORDS(src, count_src){
count := count_src modulo 64
RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vprolvq
__m256i _mm256_rolv_epi64 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_rolv_epi64 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vprolvq
CPUID Flags: AVX512VL + AVX512F
Description
Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst.
Operation
LEFT_ROTATE_QWORDS(src, count_src){
count := count_src modulo 64
RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 3
i := j*64
dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:256] := 0
vprolvq
__m512i _mm512_mask_rolv_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_rolv_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vprolvq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
LEFT_ROTATE_QWORDS(src, count_src){
count := count_src modulo 64
RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vprolvq
__m512i _mm512_maskz_rolv_epi64 (__mmask8 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_rolv_epi64 (__mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vprolvq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
LEFT_ROTATE_QWORDS(src, count_src){
count := count_src modulo 64
RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vprolvq
__m512i _mm512_rolv_epi64 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_rolv_epi64 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vprolvq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Rotate the bits in each packed 64-bit integer in a to the left by the number of bits specified in the corresponding element of b, and store the results in dst.
Operation
LEFT_ROTATE_QWORDS(src, count_src){
count := count_src modulo 64
RETURN (src << count) OR (src >> (64 - count))
}
FOR j := 0 to 7
i := j*64
dst[i+63:i] := LEFT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:512] := 0
vprord
__m128i _mm_mask_ror_epi32 (__m128i src, __mmask8 k, __m128i a, const int imm8)
Synopsis
__m128i _mm_mask_ror_epi32 (__m128i src, __mmask8 k, __m128i a, const int imm8)
#include "immintrin.h"
Instruction: vprord
CPUID Flags: AVX512VL + AVX512F
Description
Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
RIGHT_ROTATE_DWORDS(src, count_src){
count := count_src modulo 32
RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vprord
__m128i _mm_maskz_ror_epi32 (__mmask8 k, __m128i a, const int imm8)
Synopsis
__m128i _mm_maskz_ror_epi32 (__mmask8 k, __m128i a, const int imm8)
#include "immintrin.h"
Instruction: vprord
CPUID Flags: AVX512VL + AVX512F
Description
Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
RIGHT_ROTATE_DWORDS(src, count_src){
count := count_src modulo 32
RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vprord
__m128i _mm_ror_epi32 (__m128i a, const int imm8)
Synopsis
__m128i _mm_ror_epi32 (__m128i a, const int imm8)
#include "immintrin.h"
Instruction: vprord
CPUID Flags: AVX512VL + AVX512F
Description
Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.
Operation
RIGHT_ROTATE_DWORDS(src, count_src){
count := count_src modulo 32
RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 3
i := j*32
dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:128] := 0
vprord
__m256i _mm256_mask_ror_epi32 (__m256i src, __mmask8 k, __m256i a, const int imm8)
Synopsis
__m256i _mm256_mask_ror_epi32 (__m256i src, __mmask8 k, __m256i a, const int imm8)
#include "immintrin.h"
Instruction: vprord
CPUID Flags: AVX512VL + AVX512F
Description
Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
RIGHT_ROTATE_DWORDS(src, count_src){
count := count_src modulo 32
RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vprord
__m256i _mm256_maskz_ror_epi32 (__mmask8 k, __m256i a, const int imm8)
Synopsis
__m256i _mm256_maskz_ror_epi32 (__mmask8 k, __m256i a, const int imm8)
#include "immintrin.h"
Instruction: vprord
CPUID Flags: AVX512VL + AVX512F
Description
Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
RIGHT_ROTATE_DWORDS(src, count_src){
count := count_src modulo 32
RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vprord
__m256i _mm256_ror_epi32 (__m256i a, const int imm8)
Synopsis
__m256i _mm256_ror_epi32 (__m256i a, const int imm8)
#include "immintrin.h"
Instruction: vprord
CPUID Flags: AVX512VL + AVX512F
Description
Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.
Operation
RIGHT_ROTATE_DWORDS(src, count_src){
count := count_src modulo 32
RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 7
i := j*32
dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:256] := 0
vprord
__m512i _mm512_mask_ror_epi32 (__m512i src, __mmask16 k, __m512i a, int imm8)
Synopsis
__m512i _mm512_mask_ror_epi32 (__m512i src, __mmask16 k, __m512i a, int imm8)
#include "immintrin.h"
Instruction: vprord zmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
RIGHT_ROTATE_DWORDS(src, count_src){
count := count_src modulo 32
RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vprord
__m512i _mm512_maskz_ror_epi32 (__mmask16 k, __m512i a, int imm8)
Synopsis
__m512i _mm512_maskz_ror_epi32 (__mmask16 k, __m512i a, int imm8)
#include "immintrin.h"
Instruction: vprord zmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
RIGHT_ROTATE_DWORDS(src, count_src){
count := count_src modulo 32
RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vprord
__m512i _mm512_ror_epi32 (__m512i a, int imm8)
Synopsis
__m512i _mm512_ror_epi32 (__m512i a, int imm8)
#include "immintrin.h"
Instruction: vprord zmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.
Operation
RIGHT_ROTATE_DWORDS(src, count_src){
count := count_src modulo 32
RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 15
i := j*32
dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:512] := 0
vprorq
__m128i _mm_mask_ror_epi64 (__m128i src, __mmask8 k, __m128i a, const int imm8)
Synopsis
__m128i _mm_mask_ror_epi64 (__m128i src, __mmask8 k, __m128i a, const int imm8)
#include "immintrin.h"
Instruction: vprorq
CPUID Flags: AVX512VL + AVX512F
Description
Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
RIGHT_ROTATE_QWORDS(src, count_src){
count := count_src modulo 64
RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vprorq
__m128i _mm_maskz_ror_epi64 (__mmask8 k, __m128i a, const int imm8)
Synopsis
__m128i _mm_maskz_ror_epi64 (__mmask8 k, __m128i a, const int imm8)
#include "immintrin.h"
Instruction: vprorq
CPUID Flags: AVX512VL + AVX512F
Description
Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
RIGHT_ROTATE_QWORDS(src, count_src){
count := count_src modulo 64
RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vprorq
__m128i _mm_ror_epi64 (__m128i a, const int imm8)
Synopsis
__m128i _mm_ror_epi64 (__m128i a, const int imm8)
#include "immintrin.h"
Instruction: vprorq
CPUID Flags: AVX512VL + AVX512F
Description
Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.
Operation
RIGHT_ROTATE_QWORDS(src, count_src){
count := count_src modulo 64
RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 1
i := j*64
dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:128] := 0
vprorq
__m256i _mm256_mask_ror_epi64 (__m256i src, __mmask8 k, __m256i a, const int imm8)
Synopsis
__m256i _mm256_mask_ror_epi64 (__m256i src, __mmask8 k, __m256i a, const int imm8)
#include "immintrin.h"
Instruction: vprorq
CPUID Flags: AVX512VL + AVX512F
Description
Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
RIGHT_ROTATE_QWORDS(src, count_src){
count := count_src modulo 64
RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vprorq
__m256i _mm256_maskz_ror_epi64 (__mmask8 k, __m256i a, const int imm8)
Synopsis
__m256i _mm256_maskz_ror_epi64 (__mmask8 k, __m256i a, const int imm8)
#include "immintrin.h"
Instruction: vprorq
CPUID Flags: AVX512VL + AVX512F
Description
Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
RIGHT_ROTATE_QWORDS(src, count_src){
count := count_src modulo 64
RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vprorq
__m256i _mm256_ror_epi64 (__m256i a, const int imm8)
Synopsis
__m256i _mm256_ror_epi64 (__m256i a, const int imm8)
#include "immintrin.h"
Instruction: vprorq
CPUID Flags: AVX512VL + AVX512F
Description
Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.
Operation
RIGHT_ROTATE_QWORDS(src, count_src){
count := count_src modulo 64
RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 3
i := j*64
dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:256] := 0
vprorq
__m512i _mm512_mask_ror_epi64 (__m512i src, __mmask8 k, __m512i a, int imm8)
Synopsis
__m512i _mm512_mask_ror_epi64 (__m512i src, __mmask8 k, __m512i a, int imm8)
#include "immintrin.h"
Instruction: vprorq zmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
RIGHT_ROTATE_QWORDS(src, count_src){
count := count_src modulo 64
RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vprorq
__m512i _mm512_maskz_ror_epi64 (__mmask8 k, __m512i a, int imm8)
Synopsis
__m512i _mm512_maskz_ror_epi64 (__mmask8 k, __m512i a, int imm8)
#include "immintrin.h"
Instruction: vprorq zmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
RIGHT_ROTATE_QWORDS(src, count_src){
count := count_src modulo 64
RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vprorq
__m512i _mm512_ror_epi64 (__m512i a, int imm8)
Synopsis
__m512i _mm512_ror_epi64 (__m512i a, int imm8)
#include "immintrin.h"
Instruction: vprorq zmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in imm8, and store the results in dst.
Operation
RIGHT_ROTATE_QWORDS(src, count_src){
count := count_src modulo 64
RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 7
i := j*64
dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:512] := 0
vprorvd
__m128i _mm_mask_rorv_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_rorv_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vprorvd
CPUID Flags: AVX512VL + AVX512F
Description
Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
RIGHT_ROTATE_DWORDS(src, count_src){
count := count_src modulo 32
RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vprorvd
__m128i _mm_maskz_rorv_epi32 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_rorv_epi32 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vprorvd
CPUID Flags: AVX512VL + AVX512F
Description
Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
RIGHT_ROTATE_DWORDS(src, count_src){
count := count_src modulo 32
RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vprorvd
__m128i _mm_rorv_epi32 (__m128i a, __m128i b)
Synopsis
__m128i _mm_rorv_epi32 (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vprorvd
CPUID Flags: AVX512VL + AVX512F
Description
Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst.
Operation
RIGHT_ROTATE_DWORDS(src, count_src){
count := count_src modulo 32
RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 3
i := j*32
dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
ENDFOR
dst[MAX:128] := 0
vprorvd
__m256i _mm256_mask_rorv_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_rorv_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vprorvd
CPUID Flags: AVX512VL + AVX512F
Description
Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
RIGHT_ROTATE_DWORDS(src, count_src){
count := count_src modulo 32
RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vprorvd
__m256i _mm256_maskz_rorv_epi32 (__mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_rorv_epi32 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vprorvd
CPUID Flags: AVX512VL + AVX512F
Description
Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
RIGHT_ROTATE_DWORDS(src, count_src){
count := count_src modulo 32
RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vprorvd
__m256i _mm256_rorv_epi32 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_rorv_epi32 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vprorvd
CPUID Flags: AVX512VL + AVX512F
Description
Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst.
Operation
RIGHT_ROTATE_DWORDS(src, count_src){
count := count_src modulo 32
RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 7
i := j*32
dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
ENDFOR
dst[MAX:256] := 0
vprorvd
__m512i _mm512_mask_rorv_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_rorv_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vprorvd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
RIGHT_ROTATE_DWORDS(src, count_src){
count := count_src modulo 32
RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vprorvd
__m512i _mm512_maskz_rorv_epi32 (__mmask16 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_rorv_epi32 (__mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vprorvd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
RIGHT_ROTATE_DWORDS(src, count_src){
count := count_src modulo 32
RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vprorvd
__m512i _mm512_rorv_epi32 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_rorv_epi32 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vprorvd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Rotate the bits in each packed 32-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst.
Operation
RIGHT_ROTATE_DWORDS(src, count_src){
count := count_src modulo 32
RETURN (src >> count) OR (src << (32 - count))
}
FOR j := 0 to 15
i := j*32
dst[i+31:i] := RIGHT_ROTATE_DWORDS(a[i+31:i], b[i+31:i])
ENDFOR
dst[MAX:512] := 0
vprorvq
__m128i _mm_mask_rorv_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_rorv_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vprorvq
CPUID Flags: AVX512VL + AVX512F
Description
Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
RIGHT_ROTATE_QWORDS(src, count_src){
count := count_src modulo 64
RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vprorvq
__m128i _mm_maskz_rorv_epi64 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_rorv_epi64 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vprorvq
CPUID Flags: AVX512VL + AVX512F
Description
Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
RIGHT_ROTATE_QWORDS(src, count_src){
count := count_src modulo 64
RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vprorvq
__m128i _mm_rorv_epi64 (__m128i a, __m128i b)
Synopsis
__m128i _mm_rorv_epi64 (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vprorvq
CPUID Flags: AVX512VL + AVX512F
Description
Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst.
Operation
RIGHT_ROTATE_QWORDS(src, count_src){
count := count_src modulo 64
RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 1
i := j*64
dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:128] := 0
vprorvq
__m256i _mm256_mask_rorv_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_rorv_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vprorvq
CPUID Flags: AVX512VL + AVX512F
Description
Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
RIGHT_ROTATE_QWORDS(src, count_src){
count := count_src modulo 64
RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vprorvq
__m256i _mm256_maskz_rorv_epi64 (__mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_rorv_epi64 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vprorvq
CPUID Flags: AVX512VL + AVX512F
Description
Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
RIGHT_ROTATE_QWORDS(src, count_src){
count := count_src modulo 64
RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vprorvq
__m256i _mm256_rorv_epi64 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_rorv_epi64 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vprorvq
CPUID Flags: AVX512VL + AVX512F
Description
Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst.
Operation
RIGHT_ROTATE_QWORDS(src, count_src){
count := count_src modulo 64
RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 3
i := j*64
dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:256] := 0
vprorvq
__m512i _mm512_mask_rorv_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_rorv_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vprorvq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
RIGHT_ROTATE_QWORDS(src, count_src){
count := count_src modulo 64
RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vprorvq
__m512i _mm512_maskz_rorv_epi64 (__mmask8 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_rorv_epi64 (__mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vprorvq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
RIGHT_ROTATE_QWORDS(src, count_src){
count := count_src modulo 64
RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vprorvq
__m512i _mm512_rorv_epi64 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_rorv_epi64 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vprorvq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Rotate the bits in each packed 64-bit integer in a to the right by the number of bits specified in the corresponding element of b, and store the results in dst.
Operation
RIGHT_ROTATE_QWORDS(src, count_src){
count := count_src modulo 64
RETURN (src >> count) OR (src << (64 - count))
}
FOR j := 0 to 7
i := j*64
dst[i+63:i] := RIGHT_ROTATE_QWORDS(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:512] := 0
rol
unsigned int _rotl (unsigned int a, int shift)
Synopsis
unsigned int _rotl (unsigned int a, int shift)
#include "immintrin.h"
Instruction: rol r32, imm
Description
Shift the bits of unsigned 32-bit integer a left by the number of bits specified in shift, rotating the most-significant bit to the least-significant bit location, and store the unsigned result in dst.
Operation
dst := a
count := shift BITWISE AND 31
DO WHILE (count > 0)
tmp[0] := dst[31]
dst := (dst << 1) OR tmp[0]
count := count - 1
OD
Performance
ror
unsigned int _rotr (unsigned int a, int shift)
Synopsis
unsigned int _rotr (unsigned int a, int shift)
#include "immintrin.h"
Instruction: ror r32, imm
Description
Shift the bits of unsigned 32-bit integer a right by the number of bits specified in shift, rotating the least-significant bit to the most-significant bit location, and store the unsigned result in dst.
Operation
dst := a
count := shift BITWISE AND 31
DO WHILE (count > 0)
tmp[31] := dst[0]
dst := (dst >> 1) OR (tmp[31] << 31)
count := count - 1
OD
Performance
rol
unsigned short _rotwl (unsigned short a, int shift)
Synopsis
unsigned short _rotwl (unsigned short a, int shift)
#include "immintrin.h"
Instruction: rol r16, imm
Description
Shift the bits of unsigned 16-bit integer a left by the number of bits specified in shift, rotating the most-significant bit to the least-significant bit location, and store the unsigned result in dst.
Operation
dst := a
count := shift BITWISE AND 15
DO WHILE (count > 0)
tmp[0] := dst[15]
dst := (dst << 1) OR tmp[0]
count := count - 1
OD
Performance
ror
unsigned short _rotwr (unsigned short a, int shift)
Synopsis
unsigned short _rotwr (unsigned short a, int shift)
#include "immintrin.h"
Instruction: ror r16, imm
Description
Shift the bits of unsigned 16-bit integer a right by the number of bits specified in shift, rotating the least-significant bit to the most-significant bit location, and store the unsigned result in dst.
Operation
dst := a
count := shift BITWISE AND 15
DO WHILE (count > 0)
tmp[15] := dst[0]
dst := (dst >> 1) OR (tmp[15] << 15)
count := count - 1
OD
Performance
roundpd
__m128d _mm_round_pd (__m128d a, int rounding)
Synopsis
__m128d _mm_round_pd (__m128d a, int rounding)
#include "smmintrin.h"
Instruction: roundpd xmm, xmm, imm
CPUID Flags: SSE4.1
Description
Round the packed double-precision (64-bit) floating-point elements in
a using the
rounding parameter, and store the results as packed double-precision floating-point elements in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := ROUND(a[i+63:i])
ENDFOR
Performance
vroundpd
__m256d _mm256_round_pd (__m256d a, int rounding)
Synopsis
__m256d _mm256_round_pd (__m256d a, int rounding)
#include "immintrin.h"
Instruction: vroundpd ymm, ymm, imm
CPUID Flags: AVX
Description
Round the packed double-precision (64-bit) floating-point elements in
a using the
rounding parameter, and store the results as packed double-precision floating-point elements in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := ROUND(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
Performance
roundps
__m128 _mm_round_ps (__m128 a, int rounding)
Synopsis
__m128 _mm_round_ps (__m128 a, int rounding)
#include "smmintrin.h"
Instruction: roundps xmm, xmm, imm
CPUID Flags: SSE4.1
Description
Round the packed single-precision (32-bit) floating-point elements in
a using the
rounding parameter, and store the results as packed single-precision floating-point elements in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := ROUND(a[i+31:i])
ENDFOR
Performance
vroundps
__m256 _mm256_round_ps (__m256 a, int rounding)
Synopsis
__m256 _mm256_round_ps (__m256 a, int rounding)
#include "immintrin.h"
Instruction: vroundps ymm, ymm, imm
CPUID Flags: AVX
Description
Round the packed single-precision (32-bit) floating-point elements in
a using the
rounding parameter, and store the results as packed single-precision floating-point elements in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := ROUND(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
Performance
vroundps
__m512 _mm512_mask_round_ps (__m512 src, __mmask16 k, __m512 a, int rounding, _MM_EXP_ADJ_ENUM expadj)
Synopsis
__m512 _mm512_mask_round_ps (__m512 src, __mmask16 k, __m512 a, int rounding, _MM_EXP_ADJ_ENUM expadj)
#include "immintrin.h"
Instruction: vroundps zmm {k}, m512, imm
CPUID Flags: KNCNI
Description
Round the packed single-precision (32-bit) floating-point elements in
a to the nearest integer value using
expadj and in the direction of
rounding, and store the results as packed single-precision floating-point elements in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := ROUND(a[i+31:i])
CASE expadj OF
_MM_EXPADJ_NONE: dst[i+31:i] = dst[i+31:i] * 2**0
_MM_EXPADJ_4: dst[i+31:i] = dst[i+31:i] * 2**4
_MM_EXPADJ_5: dst[i+31:i] = dst[i+31:i] * 2**5
_MM_EXPADJ_8: dst[i+31:i] = dst[i+31:i] * 2**8
_MM_EXPADJ_16: dst[i+31:i] = dst[i+31:i] * 2**16
_MM_EXPADJ_24: dst[i+31:i] = dst[i+31:i] * 2**24
_MM_EXPADJ_31: dst[i+31:i] = dst[i+31:i] * 2**31
_MM_EXPADJ_32: dst[i+31:i] = dst[i+31:i] * 2**32
ESAC
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vroundps
__m512 _mm512_round_ps (__m512 a, int rounding, _MM_EXP_ADJ_ENUM expadj)
Synopsis
__m512 _mm512_round_ps (__m512 a, int rounding, _MM_EXP_ADJ_ENUM expadj)
#include "immintrin.h"
Instruction: vroundps zmm {k}, m512, imm
CPUID Flags: KNCNI
Description
Round the packed single-precision (32-bit) floating-point elements in
a to the nearest integer value using
expadj and in the direction of
rounding, and store the results as packed single-precision floating-point elements in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := ROUND(a[i+31:i])
CASE expadj OF
_MM_EXPADJ_NONE: dst[i+31:i] = dst[i+31:i] * 2**0
_MM_EXPADJ_4: dst[i+31:i] = dst[i+31:i] * 2**4
_MM_EXPADJ_5: dst[i+31:i] = dst[i+31:i] * 2**5
_MM_EXPADJ_8: dst[i+31:i] = dst[i+31:i] * 2**8
_MM_EXPADJ_16: dst[i+31:i] = dst[i+31:i] * 2**16
_MM_EXPADJ_24: dst[i+31:i] = dst[i+31:i] * 2**24
_MM_EXPADJ_31: dst[i+31:i] = dst[i+31:i] * 2**31
_MM_EXPADJ_32: dst[i+31:i] = dst[i+31:i] * 2**32
ESAC
ENDFOR
dst[MAX:512] := 0
roundsd
__m128d _mm_round_sd (__m128d a, __m128d b, int rounding)
Synopsis
__m128d _mm_round_sd (__m128d a, __m128d b, int rounding)
#include "smmintrin.h"
Instruction: roundsd xmm, xmm, imm
CPUID Flags: SSE4.1
Description
Round the lower double-precision (64-bit) floating-point element in
b using the
rounding parameter, store the result as a double-precision floating-point element in the lower element of
dst, and copy the upper element from
a to the upper element of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[63:0] := ROUND(b[63:0])
dst[127:64] := a[127:64]
Performance
roundss
__m128 _mm_round_ss (__m128 a, __m128 b, int rounding)
Synopsis
__m128 _mm_round_ss (__m128 a, __m128 b, int rounding)
#include "smmintrin.h"
Instruction: roundss xmm, xmm, imm
CPUID Flags: SSE4.1
Description
Round the lower single-precision (32-bit) floating-point element in
b using the
rounding parameter, store the result as a single-precision floating-point element in the lower element of
dst, and copy the upper 3 packed elements from
a to the upper elements of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[31:0] := ROUND(b[31:0])
dst[127:32] := a[127:32]
Performance
vrndfxpntpd
__m512d _mm512_mask_roundfxpnt_adjust_pd (__m512d src, __mmask8 k, __m512d a, int rounding, _MM_EXP_ADJ_ENUM expadj)
Synopsis
__m512d _mm512_mask_roundfxpnt_adjust_pd (__m512d src, __mmask8 k, __m512d a, int rounding, _MM_EXP_ADJ_ENUM expadj)
#include "immintrin.h"
Instruction: vrndfxpntpd zmm {k}, m512, imm
CPUID Flags: KNCNI
Description
Performs element-by-element rounding of packed double-precision (64-bit) floating-point elements in
a using
expadj and in the direction of
rounding and stores results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := ROUND(a[i+63:i])
CASE expadj OF
_MM_EXPADJ_NONE: dst[i+63:i] = dst[i+63:i] * 2**0
_MM_EXPADJ_4: dst[i+63:i] = dst[i+63:i] * 2**4
_MM_EXPADJ_5: dst[i+63:i] = dst[i+63:i] * 2**5
_MM_EXPADJ_8: dst[i+63:i] = dst[i+63:i] * 2**8
_MM_EXPADJ_16: dst[i+63:i] = dst[i+63:i] * 2**16
_MM_EXPADJ_24: dst[i+63:i] = dst[i+63:i] * 2**24
_MM_EXPADJ_31: dst[i+63:i] = dst[i+63:i] * 2**31
_MM_EXPADJ_32: dst[i+63:i] = dst[i+63:i] * 2**32
ESAC
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vrndfxpntpd
__m512d _mm512_roundfxpnt_adjust_pd (__m512d a, int rounding, _MM_EXP_ADJ_ENUM expadj)
Synopsis
__m512d _mm512_roundfxpnt_adjust_pd (__m512d a, int rounding, _MM_EXP_ADJ_ENUM expadj)
#include "immintrin.h"
Instruction: vrndfxpntpd zmm {k}, m512, imm
CPUID Flags: KNCNI
Description
Performs element-by-element rounding of packed double-precision (64-bit) floating-point elements in
a using
expadj and in the direction of
rounding and stores results in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := ROUND(a[i+63:i])
CASE expadj OF
_MM_EXPADJ_NONE: dst[i+63:i] = dst[i+63:i] * 2**0
_MM_EXPADJ_4: dst[i+63:i] = dst[i+63:i] * 2**4
_MM_EXPADJ_5: dst[i+63:i] = dst[i+63:i] * 2**5
_MM_EXPADJ_8: dst[i+63:i] = dst[i+63:i] * 2**8
_MM_EXPADJ_16: dst[i+63:i] = dst[i+63:i] * 2**16
_MM_EXPADJ_24: dst[i+63:i] = dst[i+63:i] * 2**24
_MM_EXPADJ_31: dst[i+63:i] = dst[i+63:i] * 2**31
_MM_EXPADJ_32: dst[i+63:i] = dst[i+63:i] * 2**32
ESAC
ENDFOR
dst[MAX:512] := 0
vrndfxpntps
__m512 _mm512_mask_roundfxpnt_adjust_ps (__m512 src, __mmask16 k, __m512 a, int rounding, _MM_EXP_ADJ_ENUM expadj)
Synopsis
__m512 _mm512_mask_roundfxpnt_adjust_ps (__m512 src, __mmask16 k, __m512 a, int rounding, _MM_EXP_ADJ_ENUM expadj)
#include "immintrin.h"
Instruction: vrndfxpntps zmm {k}, m512, imm
CPUID Flags: KNCNI
Description
Performs element-by-element rounding of packed single-precision (32-bit) floating-point elements in
a using
expadj and in the direction of
rounding and stores results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := ROUND(a[i+31:i])
CASE expadj OF
_MM_EXPADJ_NONE: dst[i+31:i] = dst[i+31:i] * 2**0
_MM_EXPADJ_4: dst[i+31:i] = dst[i+31:i] * 2**4
_MM_EXPADJ_5: dst[i+31:i] = dst[i+31:i] * 2**5
_MM_EXPADJ_8: dst[i+31:i] = dst[i+31:i] * 2**8
_MM_EXPADJ_16: dst[i+31:i] = dst[i+31:i] * 2**16
_MM_EXPADJ_24: dst[i+31:i] = dst[i+31:i] * 2**24
_MM_EXPADJ_31: dst[i+31:i] = dst[i+31:i] * 2**31
_MM_EXPADJ_32: dst[i+31:i] = dst[i+31:i] * 2**32
ESAC
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vrndfxpntps
__m512 _mm512_roundfxpnt_adjust_ps (__m512 a, int rounding, _MM_EXP_ADJ_ENUM expadj)
Synopsis
__m512 _mm512_roundfxpnt_adjust_ps (__m512 a, int rounding, _MM_EXP_ADJ_ENUM expadj)
#include "immintrin.h"
Instruction: vrndfxpntps zmm {k}, m512, imm
CPUID Flags: KNCNI
Description
Performs element-by-element rounding of packed single-precision (32-bit) floating-point elements in
a using
expadj and in the direction of
rounding and stores results in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := ROUND(a[i+31:i])
CASE expadj OF
_MM_EXPADJ_NONE: dst[i+31:i] = dst[i+31:i] * 2**0
_MM_EXPADJ_4: dst[i+31:i] = dst[i+31:i] * 2**4
_MM_EXPADJ_5: dst[i+31:i] = dst[i+31:i] * 2**5
_MM_EXPADJ_8: dst[i+31:i] = dst[i+31:i] * 2**8
_MM_EXPADJ_16: dst[i+31:i] = dst[i+31:i] * 2**16
_MM_EXPADJ_24: dst[i+31:i] = dst[i+31:i] * 2**24
_MM_EXPADJ_31: dst[i+31:i] = dst[i+31:i] * 2**31
_MM_EXPADJ_32: dst[i+31:i] = dst[i+31:i] * 2**32
ESAC
ENDFOR
dst[MAX:512] := 0
vrndscalepd
__m128d _mm_mask_roundscale_pd (__m128d src, __mmask8 k, __m128d a, int imm8)
Synopsis
__m128d _mm_mask_roundscale_pd (__m128d src, __mmask8 k, __m128d a, int imm8)
#include "immintrin.h"
Instruction: vrndscalepd
CPUID Flags: AVX512VL + AVX512F
Description
Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
RoundTo_IntegerPD(src[63:0], imm8[7:0]){
IF(imm8[2] == 1)
rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
ELSE
rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
FI
M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
CASE(rounding_direction)
0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
ESAC
dst[63:0] := 2^-M * tmp[63:0] // scale back down
IF imm8[3] == 0 //check SPE
IF src[63:0] != dst[63:0] //check if precision has been lost
set_precision() //set #PE
FI
FI
RETURN dst[63:0]
}
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vrndscalepd
__m128d _mm_maskz_roundscale_pd (__mmask8 k, __m128d a, int imm8)
Synopsis
__m128d _mm_maskz_roundscale_pd (__mmask8 k, __m128d a, int imm8)
#include "immintrin.h"
Instruction: vrndscalepd
CPUID Flags: AVX512VL + AVX512F
Description
Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
RoundTo_IntegerPD(src[63:0], imm8[7:0]){
IF(imm8[2] == 1)
rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
ELSE
rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
FI
M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
CASE(rounding_direction)
0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
ESAC
dst[63:0] := 2^-M * tmp[63:0] // scale back down
IF imm8[3] == 0 //check SPE
IF src[63:0] != dst[63:0] //check if precision has been lost
set_precision() //set #PE
FI
FI
RETURN dst[63:0]
}
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vrndscalepd
__m128d _mm_roundscale_pd (__m128d a, int imm8)
Synopsis
__m128d _mm_roundscale_pd (__m128d a, int imm8)
#include "immintrin.h"
Instruction: vrndscalepd
CPUID Flags: AVX512VL + AVX512F
Description
Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.
Operation
RoundTo_IntegerPD(src[63:0], imm8[7:0]){
IF(imm8[2] == 1)
rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
ELSE
rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
FI
M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
CASE(rounding_direction)
0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
ESAC
dst[63:0] := 2^-M * tmp[63:0] // scale back down
IF imm8[3] == 0 //check SPE
IF src[63:0] != dst[63:0] //check if precision has been lost
set_precision() //set #PE
FI
FI
RETURN dst[63:0]
}
FOR j := 0 to 1
i := j*64
dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:128] := 0
vrndscalepd
__m256d _mm256_mask_roundscale_pd (__m256d src, __mmask8 k, __m256d a, int imm8)
Synopsis
__m256d _mm256_mask_roundscale_pd (__m256d src, __mmask8 k, __m256d a, int imm8)
#include "immintrin.h"
Instruction: vrndscalepd
CPUID Flags: AVX512VL + AVX512F
Description
Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
RoundTo_IntegerPD(src[63:0], imm8[7:0]){
IF(imm8[2] == 1)
rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
ELSE
rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
FI
M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
CASE(rounding_direction)
0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
ESAC
dst[63:0] := 2^-M * tmp[63:0] // scale back down
IF imm8[3] == 0 //check SPE
IF src[63:0] != dst[63:0] //check if precision has been lost
set_precision() //set #PE
FI
FI
RETURN dst[63:0]
}
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vrndscalepd
__m256d _mm256_maskz_roundscale_pd (__mmask8 k, __m256d a, int imm8)
Synopsis
__m256d _mm256_maskz_roundscale_pd (__mmask8 k, __m256d a, int imm8)
#include "immintrin.h"
Instruction: vrndscalepd
CPUID Flags: AVX512VL + AVX512F
Description
Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
RoundTo_IntegerPD(src[63:0], imm8[7:0]){
IF(imm8[2] == 1)
rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
ELSE
rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
FI
M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
CASE(rounding_direction)
0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
ESAC
dst[63:0] := 2^-M * tmp[63:0] // scale back down
IF imm8[3] == 0 //check SPE
IF src[63:0] != dst[63:0] //check if precision has been lost
set_precision() //set #PE
FI
FI
RETURN dst[63:0]
}
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vrndscalepd
__m256d _mm256_roundscale_pd (__m256d a, int imm8)
Synopsis
__m256d _mm256_roundscale_pd (__m256d a, int imm8)
#include "immintrin.h"
Instruction: vrndscalepd
CPUID Flags: AVX512VL + AVX512F
Description
Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.
Operation
RoundTo_IntegerPD(src[63:0], imm8[7:0]){
IF(imm8[2] == 1)
rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
ELSE
rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
FI
M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
CASE(rounding_direction)
0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
ESAC
dst[63:0] := 2^-M * tmp[63:0] // scale back down
IF imm8[3] == 0 //check SPE
IF src[63:0] != dst[63:0] //check if precision has been lost
set_precision() //set #PE
FI
FI
RETURN dst[63:0]
}
FOR j := 0 to 3
i := j*64
dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:256] := 0
vrndscalepd
__m512d _mm512_mask_roundscale_pd (__m512d src, __mmask8 k, __m512d a, int imm8)
Synopsis
__m512d _mm512_mask_roundscale_pd (__m512d src, __mmask8 k, __m512d a, int imm8)
#include "immintrin.h"
Instruction: vrndscalepd zmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
RoundTo_IntegerPD(src[63:0], imm8[7:0]){
IF(imm8[2] == 1)
rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
ELSE
rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
FI
M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
CASE(rounding_direction)
0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
ESAC
dst[63:0] := 2^-M * tmp[63:0] // scale back down
IF imm8[3] == 0 //check SPE
IF src[63:0] != dst[63:0] //check if precision has been lost
set_precision() //set #PE
FI
FI
RETURN dst[63:0]
}
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vrndscalepd
__m512d _mm512_maskz_roundscale_pd (__mmask8 k, __m512d a, int imm8)
Synopsis
__m512d _mm512_maskz_roundscale_pd (__mmask8 k, __m512d a, int imm8)
#include "immintrin.h"
Instruction: vrndscalepd zmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
RoundTo_IntegerPD(src[63:0], imm8[7:0]){
IF(imm8[2] == 1)
rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
ELSE
rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
FI
M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
CASE(rounding_direction)
0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
ESAC
dst[63:0] := 2^-M * tmp[63:0] // scale back down
IF imm8[3] == 0 //check SPE
IF src[63:0] != dst[63:0] //check if precision has been lost
set_precision() //set #PE
FI
FI
RETURN dst[63:0]
}
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vrndscalepd
__m512d _mm512_roundscale_pd (__m512d a, int imm8)
Synopsis
__m512d _mm512_roundscale_pd (__m512d a, int imm8)
#include "immintrin.h"
Instruction: vrndscalepd zmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.
Operation
RoundTo_IntegerPD(src[63:0], imm8[7:0]){
IF(imm8[2] == 1)
rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
ELSE
rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
FI
M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
CASE(rounding_direction)
0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
ESAC
dst[63:0] := 2^-M * tmp[63:0] // scale back down
IF imm8[3] == 0 //check SPE
IF src[63:0] != dst[63:0] //check if precision has been lost
set_precision() //set #PE
FI
FI
RETURN dst[63:0]
}
FOR j := 0 to 7
i := j*64
dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:512] := 0
vrndscaleps
__m128 _mm_mask_roundscale_ps (__m128 src, __mmask8 k, __m128 a, int imm8)
Synopsis
__m128 _mm_mask_roundscale_ps (__m128 src, __mmask8 k, __m128 a, int imm8)
#include "immintrin.h"
Instruction: vrndscaleps
CPUID Flags: AVX512VL + AVX512F
Description
Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
RoundTo_IntegerPS(src[31:0], imm8[7:0]){
IF(imm8[2] == 1)
rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
ELSE
rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
FI
M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
CASE(rounding_direction)
0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
ESAC
dst[31:0] := 2^-M * tmp[31:0] // scale back down
IF imm8[3] == 0 //check SPE
IF src[31:0] != dst[31:0] //check if precision has been lost
set_precision() //set #PE
FI
FI
RETURN dst[31:0]
}
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vrndscaleps
__m128 _mm_maskz_roundscale_ps (__mmask8 k, __m128 a, int imm8)
Synopsis
__m128 _mm_maskz_roundscale_ps (__mmask8 k, __m128 a, int imm8)
#include "immintrin.h"
Instruction: vrndscaleps
CPUID Flags: AVX512VL + AVX512F
Description
Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
RoundTo_IntegerPS(src[31:0], imm8[7:0]){
IF(imm8[2] == 1)
rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
ELSE
rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
FI
M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
CASE(rounding_direction)
0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
ESAC
dst[31:0] := 2^-M * tmp[31:0] // scale back down
IF imm8[3] == 0 //check SPE
IF src[31:0] != dst[31:0] //check if precision has been lost
set_precision() //set #PE
FI
FI
RETURN dst[31:0]
}
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vrndscaleps
__m128 _mm_roundscale_ps (__m128 a, int imm8)
Synopsis
__m128 _mm_roundscale_ps (__m128 a, int imm8)
#include "immintrin.h"
Instruction: vrndscaleps
CPUID Flags: AVX512VL + AVX512F
Description
Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.
Operation
RoundTo_IntegerPS(src[31:0], imm8[7:0]){
IF(imm8[2] == 1)
rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
ELSE
rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
FI
M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
CASE(rounding_direction)
0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
ESAC
dst[31:0] := 2^-M * tmp[31:0] // scale back down
IF imm8[3] == 0 //check SPE
IF src[31:0] != dst[31:0] //check if precision has been lost
set_precision() //set #PE
FI
FI
RETURN dst[31:0]
}
FOR j := 0 to 3
i := j*32
dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:128] := 0
vrndscaleps
__m256 _mm256_mask_roundscale_ps (__m256 src, __mmask8 k, __m256 a, int imm8)
Synopsis
__m256 _mm256_mask_roundscale_ps (__m256 src, __mmask8 k, __m256 a, int imm8)
#include "immintrin.h"
Instruction: vrndscaleps
CPUID Flags: AVX512VL + AVX512F
Description
Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
RoundTo_IntegerPS(src[31:0], imm8[7:0]){
IF(imm8[2] == 1)
rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
ELSE
rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
FI
M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
CASE(rounding_direction)
0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
ESAC
dst[31:0] := 2^-M * tmp[31:0] // scale back down
IF imm8[3] == 0 //check SPE
IF src[31:0] != dst[31:0] //check if precision has been lost
set_precision() //set #PE
FI
FI
RETURN dst[31:0]
}
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vrndscaleps
__m256 _mm256_maskz_roundscale_ps (__mmask8 k, __m256 a, int imm8)
Synopsis
__m256 _mm256_maskz_roundscale_ps (__mmask8 k, __m256 a, int imm8)
#include "immintrin.h"
Instruction: vrndscaleps
CPUID Flags: AVX512VL + AVX512F
Description
Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
RoundTo_IntegerPS(src[31:0], imm8[7:0]){
IF(imm8[2] == 1)
rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
ELSE
rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
FI
M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
CASE(rounding_direction)
0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
ESAC
dst[31:0] := 2^-M * tmp[31:0] // scale back down
IF imm8[3] == 0 //check SPE
IF src[31:0] != dst[31:0] //check if precision has been lost
set_precision() //set #PE
FI
FI
RETURN dst[31:0]
}
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vrndscaleps
__m256 _mm256_roundscale_ps (__m256 a, int imm8)
Synopsis
__m256 _mm256_roundscale_ps (__m256 a, int imm8)
#include "immintrin.h"
Instruction: vrndscaleps
CPUID Flags: AVX512VL + AVX512F
Description
Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.
Operation
RoundTo_IntegerPS(src[31:0], imm8[7:0]){
IF(imm8[2] == 1)
rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
ELSE
rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
FI
M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
CASE(rounding_direction)
0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
ESAC
dst[31:0] := 2^-M * tmp[31:0] // scale back down
IF imm8[3] == 0 //check SPE
IF src[31:0] != dst[31:0] //check if precision has been lost
set_precision() //set #PE
FI
FI
RETURN dst[31:0]
}
FOR j := 0 to 7
i := j*32
dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:256] := 0
vrndscaleps
__m512 _mm512_mask_roundscale_ps (__m512 src, __mmask16 k, __m512 a, int imm8)
Synopsis
__m512 _mm512_mask_roundscale_ps (__m512 src, __mmask16 k, __m512 a, int imm8)
#include "immintrin.h"
Instruction: vrndscaleps zmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
RoundTo_IntegerPS(src[31:0], imm8[7:0]){
IF(imm8[2] == 1)
rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
ELSE
rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
FI
M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
CASE(rounding_direction)
0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
ESAC
dst[31:0] := 2^-M * tmp[31:0] // scale back down
IF imm8[3] == 0 //check SPE
IF src[31:0] != dst[31:0] //check if precision has been lost
set_precision() //set #PE
FI
FI
RETURN dst[31:0]
}
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vrndscaleps
__m512 _mm512_maskz_roundscale_ps (__mmask16 k, __m512 a, int imm8)
Synopsis
__m512 _mm512_maskz_roundscale_ps (__mmask16 k, __m512 a, int imm8)
#include "immintrin.h"
Instruction: vrndscaleps zmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
RoundTo_IntegerPS(src[31:0], imm8[7:0]){
IF(imm8[2] == 1)
rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
ELSE
rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
FI
M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
CASE(rounding_direction)
0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
ESAC
dst[31:0] := 2^-M * tmp[31:0] // scale back down
IF imm8[3] == 0 //check SPE
IF src[31:0] != dst[31:0] //check if precision has been lost
set_precision() //set #PE
FI
FI
RETURN dst[31:0]
}
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vrndscaleps
__m512 _mm512_roundscale_ps (__m512 a, int imm8)
Synopsis
__m512 _mm512_roundscale_ps (__m512 a, int imm8)
#include "immintrin.h"
Instruction: vrndscaleps zmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.
Operation
RoundTo_IntegerPS(src[31:0], imm8[7:0]){
IF(imm8[2] == 1)
rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
ELSE
rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
FI
M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
CASE(rounding_direction)
0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
ESAC
dst[31:0] := 2^-M * tmp[31:0] // scale back down
IF imm8[3] == 0 //check SPE
IF src[31:0] != dst[31:0] //check if precision has been lost
set_precision() //set #PE
FI
FI
RETURN dst[31:0]
}
FOR j := 0 to 15
i := j*32
dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:512] := 0
vrndscalepd
__m512d _mm512_mask_roundscale_round_pd (__m512d src, __mmask8 k, __m512d a, int imm8, int rounding)
Synopsis
__m512d _mm512_mask_roundscale_round_pd (__m512d src, __mmask8 k, __m512d a, int imm8, int rounding)
#include "immintrin.h"
Instruction: vrndscalepd zmm {k}, zmm, imm {er}
CPUID Flags: AVX512F
Description
Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
RoundTo_IntegerPD(src[63:0], imm8[7:0]){
IF(imm8[2] == 1)
rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
ELSE
rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
FI
M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
CASE(rounding_direction)
0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
ESAC
dst[63:0] := 2^-M * tmp[63:0] // scale back down
IF imm8[3] == 0 //check SPE
IF src[63:0] != dst[63:0] //check if precision has been lost
set_precision() //set #PE
FI
FI
RETURN dst[63:0]
}
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vrndscalepd
__m512d _mm512_maskz_roundscale_round_pd (__mmask8 k, __m512d a, int imm8, int rounding)
Synopsis
__m512d _mm512_maskz_roundscale_round_pd (__mmask8 k, __m512d a, int imm8, int rounding)
#include "immintrin.h"
Instruction: vrndscalepd zmm {k}, zmm, imm {er}
CPUID Flags: AVX512F
Description
Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
RoundTo_IntegerPD(src[63:0], imm8[7:0]){
IF(imm8[2] == 1)
rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
ELSE
rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
FI
M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
CASE(rounding_direction)
0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
ESAC
dst[63:0] := 2^-M * tmp[63:0] // scale back down
IF imm8[3] == 0 //check SPE
IF src[63:0] != dst[63:0] //check if precision has been lost
set_precision() //set #PE
FI
FI
RETURN dst[63:0]
}
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vrndscalepd
__m512d _mm512_roundscale_round_pd (__m512d a, int imm8, int rounding)
Synopsis
__m512d _mm512_roundscale_round_pd (__m512d a, int imm8, int rounding)
#include "immintrin.h"
Instruction: vrndscalepd zmm {k}, zmm, imm {er}
CPUID Flags: AVX512F
Description
Round packed double-precision (64-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
RoundTo_IntegerPD(src[63:0], imm8[7:0]){
IF(imm8[2] == 1)
rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
ELSE
rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
FI
M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
CASE(rounding_direction)
0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
ESAC
dst[63:0] := 2^-M * tmp[63:0] // scale back down
IF imm8[3] == 0 //check SPE
IF src[63:0] != dst[63:0] //check if precision has been lost
set_precision() //set #PE
FI
FI
RETURN dst[63:0]
}
FOR j := 0 to 7
i := j*64
dst[i+63:i] := RoundTo_IntegerPD(a[i+63:i], imm8[7:0])
ENDFOR
dst[MAX:512] := 0
vrndscaleps
__m512 _mm512_mask_roundscale_round_ps (__m512 src, __mmask16 k, __m512 a, int imm8, int rounding)
Synopsis
__m512 _mm512_mask_roundscale_round_ps (__m512 src, __mmask16 k, __m512 a, int imm8, int rounding)
#include "immintrin.h"
Instruction: vrndscaleps zmm {k}, zmm, imm {er}
CPUID Flags: AVX512F
Description
Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
RoundTo_IntegerPS(src[31:0], imm8[7:0]){
IF(imm8[2] == 1)
rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
ELSE
rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
FI
M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
CASE(rounding_direction)
0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
ESAC
dst[31:0] := 2^-M * tmp[31:0] // scale back down
IF imm8[3] == 0 //check SPE
IF src[31:0] != dst[31:0] //check if precision has been lost
set_precision() //set #PE
FI
FI
RETURN dst[31:0]
}
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vrndscaleps
__m512 _mm512_maskz_roundscale_round_ps (__mmask16 k, __m512 a, int imm8, int rounding)
Synopsis
__m512 _mm512_maskz_roundscale_round_ps (__mmask16 k, __m512 a, int imm8, int rounding)
#include "immintrin.h"
Instruction: vrndscaleps zmm {k}, zmm, imm {er}
CPUID Flags: AVX512F
Description
Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
RoundTo_IntegerPS(src[31:0], imm8[7:0]){
IF(imm8[2] == 1)
rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
ELSE
rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
FI
M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
CASE(rounding_direction)
0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
ESAC
dst[31:0] := 2^-M * tmp[31:0] // scale back down
IF imm8[3] == 0 //check SPE
IF src[31:0] != dst[31:0] //check if precision has been lost
set_precision() //set #PE
FI
FI
RETURN dst[31:0]
}
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vrndscaleps
__m512 _mm512_roundscale_round_ps (__m512 a, int imm8, int rounding)
Synopsis
__m512 _mm512_roundscale_round_ps (__m512 a, int imm8, int rounding)
#include "immintrin.h"
Instruction: vrndscaleps zmm {k}, zmm, imm {er}
CPUID Flags: AVX512F
Description
Round packed single-precision (32-bit) floating-point elements in a to the number of fraction bits specified by imm8, and store the results in dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
RoundTo_IntegerPS(src[31:0], imm8[7:0]){
IF(imm8[2] == 1)
rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
ELSE
rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
FI
M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
CASE(rounding_direction)
0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
ESAC
dst[31:0] := 2^-M * tmp[31:0] // scale back down
IF imm8[3] == 0 //check SPE
IF src[31:0] != dst[31:0] //check if precision has been lost
set_precision() //set #PE
FI
FI
RETURN dst[31:0]
}
FOR j := 0 to 15
i := j*32
dst[i+31:i] := RoundTo_IntegerPS(a[i+31:i], imm8[7:0])
ENDFOR
dst[MAX:512] := 0
vrndscalesd
__m128d _mm_mask_roundscale_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, const int imm8, const int rounding)
Synopsis
__m128d _mm_mask_roundscale_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, const int imm8, const int rounding)
#include "immintrin.h"
Instruction: vrndscalesd xmm {k}, xmm, xmm, imm {er}
CPUID Flags: AVX512F
Description
Round the lower double-precision (64-bit) floating-point element in a to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from b to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
RoundTo_IntegerPD(src[63:0], imm8[7:0]){
IF(imm8[2] == 1)
rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
ELSE
rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
FI
M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
CASE(rounding_direction)
0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
ESAC
dst[63:0] := 2^-M * tmp[63:0] // scale back down
IF imm8[3] == 0 //check SPE
IF src[63:0] != dst[63:0] //check if precision has been lost
set_precision() //set #PE
FI
FI
RETURN dst[63:0]
}
IF k[0]
dst[63:0] := RoundTo_IntegerPD(a[63:0], imm8[7:0])
ELSE
dst[63:0] := src[63:0]
FI
dst[127:64] := b[127:64]
dst[MAX:128] := 0
vrndscalesd
__m128d _mm_maskz_roundscale_round_sd (__mmask8 k, __m128d a, __m128d b, const int imm8, const int rounding)
Synopsis
__m128d _mm_maskz_roundscale_round_sd (__mmask8 k, __m128d a, __m128d b, const int imm8, const int rounding)
#include "immintrin.h"
Instruction: vrndscalesd xmm {k}, xmm, xmm, imm {er}
CPUID Flags: AVX512F
Description
Round the lower double-precision (64-bit) floating-point element in a to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from b to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
RoundTo_IntegerPD(src[63:0], imm8[7:0]){
IF(imm8[2] == 1)
rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
ELSE
rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
FI
M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
CASE(rounding_direction)
0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
ESAC
dst[63:0] := 2^-M * tmp[63:0] // scale back down
IF imm8[3] == 0 //check SPE
IF src[63:0] != dst[63:0] //check if precision has been lost
set_precision() //set #PE
FI
FI
RETURN dst[63:0]
}
IF k[0]
dst[63:0] := RoundTo_IntegerPD(a[63:0], imm8[7:0])
ELSE
dst[63:0] := 0
FI
dst[127:64] := b[127:64]
dst[MAX:128] := 0
vrndscalesd
__m128d _mm_roundscale_round_sd (__m128d a, __m128d b, const int imm8, const int rounding)
Synopsis
__m128d _mm_roundscale_round_sd (__m128d a, __m128d b, const int imm8, const int rounding)
#include "immintrin.h"
Instruction: vrndscalesd xmm {k}, xmm, xmm, imm {er}
CPUID Flags: AVX512F
Description
Round the lower double-precision (64-bit) floating-point element in a to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper element from b to the upper element of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
RoundTo_IntegerPD(src[63:0], imm8[7:0]){
IF(imm8[2] == 1)
rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
ELSE
rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
FI
M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
CASE(rounding_direction)
0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
ESAC
dst[63:0] := 2^-M * tmp[63:0] // scale back down
IF imm8[3] == 0 //check SPE
IF src[63:0] != dst[63:0] //check if precision has been lost
set_precision() //set #PE
FI
FI
RETURN dst[63:0]
}
dst[63:0] := RoundTo_IntegerPD(a[63:0], imm8[7:0])
dst[127:64] := b[127:64]
dst[MAX:128] := 0
vrndscaless
__m128 _mm_mask_roundscale_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, const int imm8, const int rounding)
Synopsis
__m128 _mm_mask_roundscale_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, const int imm8, const int rounding)
#include "immintrin.h"
Instruction: vrndscaless xmm {k}, xmm, xmm, imm {er}
CPUID Flags: AVX512F
Description
Round the lower single-precision (32-bit) floating-point element in a to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from b to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
RoundTo_IntegerPS(src[31:0], imm8[7:0]){
IF(imm8[2] == 1)
rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
ELSE
rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
FI
M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
CASE(rounding_direction)
0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
ESAC
dst[31:0] := 2^-M * tmp[31:0] // scale back down
IF imm8[3] == 0 //check SPE
IF src[31:0] != dst[31:0] //check if precision has been lost
set_precision() //set #PE
FI
FI
RETURN dst[31:0]
}
IF k[0]
dst[31:0] := RoundTo_IntegerPS(a[31:0], imm8[7:0])
ELSE
dst[31:0] := src[31:0]
FI
dst[127:32] := b[127:32]
dst[MAX:128] := 0
vrndscaless
__m128 _mm_maskz_roundscale_round_ss (__mmask8 k, __m128 a, __m128 b, const int imm8, const int rounding)
Synopsis
__m128 _mm_maskz_roundscale_round_ss (__mmask8 k, __m128 a, __m128 b, const int imm8, const int rounding)
#include "immintrin.h"
Instruction: vrndscaless xmm {k}, xmm, xmm, imm {er}
CPUID Flags: AVX512F
Description
Round the lower single-precision (32-bit) floating-point element in a to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from b to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
RoundTo_IntegerPS(src[31:0], imm8[7:0]){
IF(imm8[2] == 1)
rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
ELSE
rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
FI
M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
CASE(rounding_direction)
0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
ESAC
dst[31:0] := 2^-M * tmp[31:0] // scale back down
IF imm8[3] == 0 //check SPE
IF src[31:0] != dst[31:0] //check if precision has been lost
set_precision() //set #PE
FI
FI
RETURN dst[31:0]
}
IF k[0]
dst[31:0] := RoundTo_IntegerPS(a[31:0], imm8[7:0])
ELSE
dst[31:0] := 0
FI
dst[127:32] := b[127:32]
dst[MAX:128] := 0
vrndscaless
__m128 _mm_roundscale_round_ss (__m128 a, __m128 b, const int imm8, const int rounding)
Synopsis
__m128 _mm_roundscale_round_ss (__m128 a, __m128 b, const int imm8, const int rounding)
#include "immintrin.h"
Instruction: vrndscaless xmm {k}, xmm, xmm, imm {er}
CPUID Flags: AVX512F
Description
Round the lower single-precision (32-bit) floating-point element in a to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper 3 packed elements from b to the upper elements of dst.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
RoundTo_IntegerPS(src[31:0], imm8[7:0]){
IF(imm8[2] == 1)
rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
ELSE
rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
FI
M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
CASE(rounding_direction)
0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
ESAC
dst[31:0] := 2^-M * tmp[31:0] // scale back down
IF imm8[3] == 0 //check SPE
IF src[31:0] != dst[31:0] //check if precision has been lost
set_precision() //set #PE
FI
FI
RETURN dst[31:0]
}
dst[31:0] := RoundTo_IntegerPS(a[31:0], imm8[7:0])
dst[127:32] := b[127:32]
dst[MAX:128] := 0
vrndscalesd
__m128d _mm_mask_roundscale_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, const int imm8)
Synopsis
__m128d _mm_mask_roundscale_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, const int imm8)
#include "immintrin.h"
Instruction: vrndscalesd xmm {k}, xmm, xmm, imm
CPUID Flags: AVX512F
Description
Round the lower double-precision (64-bit) floating-point element in a to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from b to the upper element of dst.
Operation
RoundTo_IntegerPD(src[63:0], imm8[7:0]){
IF(imm8[2] == 1)
rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
ELSE
rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
FI
M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
CASE(rounding_direction)
0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
ESAC
dst[63:0] := 2^-M * tmp[63:0] // scale back down
IF imm8[3] == 0 //check SPE
IF src[63:0] != dst[63:0] //check if precision has been lost
set_precision() //set #PE
FI
FI
RETURN dst[63:0]
}
IF k[0]
dst[63:0] := RoundTo_IntegerPD(a[63:0], imm8[7:0])
ELSE
dst[63:0] := src[63:0]
FI
dst[127:64] := b[127:64]
dst[MAX:128] := 0
vrndscalesd
__m128d _mm_maskz_roundscale_sd (__mmask8 k, __m128d a, __m128d b, const int imm8)
Synopsis
__m128d _mm_maskz_roundscale_sd (__mmask8 k, __m128d a, __m128d b, const int imm8)
#include "immintrin.h"
Instruction: vrndscalesd xmm {k}, xmm, xmm, imm
CPUID Flags: AVX512F
Description
Round the lower double-precision (64-bit) floating-point element in a to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from b to the upper element of dst.
Operation
RoundTo_IntegerPD(src[63:0], imm8[7:0]){
IF(imm8[2] == 1)
rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
ELSE
rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
FI
M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
CASE(rounding_direction)
0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
ESAC
dst[63:0] := 2^-M * tmp[63:0] // scale back down
IF imm8[3] == 0 //check SPE
IF src[63:0] != dst[63:0] //check if precision has been lost
set_precision() //set #PE
FI
FI
RETURN dst[63:0]
}
IF k[0]
dst[63:0] := RoundTo_IntegerPD(a[63:0], imm8[7:0])
ELSE
dst[63:0] := 0
FI
dst[127:64] := b[127:64]
dst[MAX:128] := 0
vrndscalesd
__m128d _mm_roundscale_sd (__m128d a, __m128d b, const int imm8)
Synopsis
__m128d _mm_roundscale_sd (__m128d a, __m128d b, const int imm8)
#include "immintrin.h"
Instruction: vrndscalesd xmm {k}, xmm, xmm, imm
CPUID Flags: AVX512F
Description
Round the lower double-precision (64-bit) floating-point element in a to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper element from b to the upper element of dst.
Operation
RoundTo_IntegerPD(src[63:0], imm8[7:0]){
IF(imm8[2] == 1)
rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
ELSE
rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
FI
M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
CASE(rounding_direction)
0: tmp[63:0] := round_to_nearest_even_integer(2^M * src[63:0])
1: tmp[63:0] := round_to_equal_or_smaller_integer(2^M * src[63:0])
2: tmp[63:0] := round_to_equal_or_larger_integer(2^M * src[63:0])
3: tmp[63:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[63:0])
ESAC
dst[63:0] := 2^-M * tmp[63:0] // scale back down
IF imm8[3] == 0 //check SPE
IF src[63:0] != dst[63:0] //check if precision has been lost
set_precision() //set #PE
FI
FI
RETURN dst[63:0]
}
dst[63:0] := RoundTo_IntegerPD(a[63:0], imm8[7:0])
dst[127:64] := b[127:64]
dst[MAX:128] := 0
vrndscaless
__m128 _mm_mask_roundscale_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, const int imm8)
Synopsis
__m128 _mm_mask_roundscale_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, const int imm8)
#include "immintrin.h"
Instruction: vrndscaless xmm {k}, xmm, xmm, imm
CPUID Flags: AVX512F
Description
Round the lower single-precision (32-bit) floating-point element in a to the number of fraction bits specified by imm8, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from b to the upper elements of dst.
Operation
RoundTo_IntegerPS(src[31:0], imm8[7:0]){
IF(imm8[2] == 1)
rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
ELSE
rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
FI
M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
CASE(rounding_direction)
0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
ESAC
dst[31:0] := 2^-M * tmp[31:0] // scale back down
IF imm8[3] == 0 //check SPE
IF src[31:0] != dst[31:0] //check if precision has been lost
set_precision() //set #PE
FI
FI
RETURN dst[31:0]
}
IF k[0]
dst[31:0] := RoundTo_IntegerPS(a[31:0], imm8[7:0])
ELSE
dst[31:0] := src[31:0]
FI
dst[127:32] := b[127:32]
dst[MAX:128] := 0
vrndscaless
__m128 _mm_maskz_roundscale_ss (__mmask8 k, __m128 a, __m128 b, const int imm8)
Synopsis
__m128 _mm_maskz_roundscale_ss (__mmask8 k, __m128 a, __m128 b, const int imm8)
#include "immintrin.h"
Instruction: vrndscaless xmm {k}, xmm, xmm, imm
CPUID Flags: AVX512F
Description
Round the lower single-precision (32-bit) floating-point element in a to the number of fraction bits specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from b to the upper elements of dst.
Operation
RoundTo_IntegerPS(src[31:0], imm8[7:0]){
IF(imm8[2] == 1)
rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
ELSE
rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
FI
M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
CASE(rounding_direction)
0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
ESAC
dst[31:0] := 2^-M * tmp[31:0] // scale back down
IF imm8[3] == 0 //check SPE
IF src[31:0] != dst[31:0] //check if precision has been lost
set_precision() //set #PE
FI
FI
RETURN dst[31:0]
}
IF k[0]
dst[31:0] := RoundTo_IntegerPS(a[31:0], imm8[7:0])
ELSE
dst[31:0] := 0
FI
dst[127:32] := b[127:32]
dst[MAX:128] := 0
vrndscaless
__m128 _mm_roundscale_ss (__m128 a, __m128 b, const int imm8)
Synopsis
__m128 _mm_roundscale_ss (__m128 a, __m128 b, const int imm8)
#include "immintrin.h"
Instruction: vrndscaless xmm {k}, xmm, xmm, imm
CPUID Flags: AVX512F
Description
Round the lower single-precision (32-bit) floating-point element in a to the number of fraction bits specified by imm8, store the result in the lower element of dst, and copy the upper 3 packed elements from b to the upper elements of dst.
Operation
RoundTo_IntegerPS(src[31:0], imm8[7:0]){
IF(imm8[2] == 1)
rounding_direction := MXCSR.RC //Use the rounding mode specified by MXCSR.RC
ELSE
rounding_direction := imm8[1:0] //Use the rounding mode specified by imm8[1:0]
FI
M := imm8[7:4] // The scaling factor (number of fraction bits to round to)
CASE(rounding_direction)
0: tmp[31:0] := round_to_nearest_even_integer(2^M * src[31:0])
1: tmp[31:0] := round_to_equal_or_smaller_integer(2^M * src[31:0])
2: tmp[31:0] := round_to_equal_or_larger_integer(2^M * src[31:0])
3: tmp[31:0] := round_to_nearest_smallest_magnitude_integer(2^M * src[31:0])
ESAC
dst[31:0] := 2^-M * tmp[31:0] // scale back down
IF imm8[3] == 0 //check SPE
IF src[31:0] != dst[31:0] //check if precision has been lost
set_precision() //set #PE
FI
FI
RETURN dst[31:0]
}
dst[31:0] := RoundTo_IntegerPS(a[31:0], imm8[7:0])
dst[127:32] := b[127:32]
dst[MAX:128] := 0
rsqrtps
__m128 _mm_rsqrt_ps (__m128 a)
Synopsis
__m128 _mm_rsqrt_ps (__m128 a)
#include "xmmintrin.h"
Instruction: rsqrtps xmm, xmm
CPUID Flags: SSE
Description
Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 1.5*2^-12.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i]))
ENDFOR
Performance
vrsqrtps
__m256 _mm256_rsqrt_ps (__m256 a)
Synopsis
__m256 _mm256_rsqrt_ps (__m256 a)
#include "immintrin.h"
Instruction: vrsqrtps ymm, ymm
CPUID Flags: AVX
Description
Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 1.5*2^-12.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i]))
ENDFOR
dst[MAX:256] := 0
Performance
rsqrtss
__m128 _mm_rsqrt_ss (__m128 a)
Synopsis
__m128 _mm_rsqrt_ss (__m128 a)
#include "xmmintrin.h"
Instruction: rsqrtss xmm, xmm
CPUID Flags: SSE
Description
Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 1.5*2^-12.
Operation
dst[31:0] := APPROXIMATE(1.0 / SQRT(a[31:0]))
dst[127:32] := a[127:32]
Performance
vrsqrt14pd
__m128d _mm_mask_rsqrt14_pd (__m128d src, __mmask8 k, __m128d a)
Synopsis
__m128d _mm_mask_rsqrt14_pd (__m128d src, __mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vrsqrt14pd
CPUID Flags: AVX512VL + AVX512F
Description
Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := APPROXIMATE(1.0 / SQRT(a[i+63:i]))
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vrsqrt14pd
__m128d _mm_maskz_rsqrt14_pd (__mmask8 k, __m128d a)
Synopsis
__m128d _mm_maskz_rsqrt14_pd (__mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vrsqrt14pd
CPUID Flags: AVX512VL + AVX512F
Description
Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := APPROXIMATE(1.0 / SQRT(a[i+63:i]))
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vrsqrt14pd
__m256d _mm256_mask_rsqrt14_pd (__m256d src, __mmask8 k, __m256d a)
Synopsis
__m256d _mm256_mask_rsqrt14_pd (__m256d src, __mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vrsqrt14pd
CPUID Flags: AVX512VL + AVX512F
Description
Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := APPROXIMATE(1.0 / SQRT(a[i+63:i]))
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vrsqrt14pd
__m256d _mm256_maskz_rsqrt14_pd (__mmask8 k, __m256d a)
Synopsis
__m256d _mm256_maskz_rsqrt14_pd (__mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vrsqrt14pd
CPUID Flags: AVX512VL + AVX512F
Description
Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := APPROXIMATE(1.0 / SQRT(a[i+63:i]))
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vrsqrt14pd
__m512d _mm512_mask_rsqrt14_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_rsqrt14_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vrsqrt14pd zmm {k}, zmm
CPUID Flags: AVX512F
Description
Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := APPROXIMATE(1.0 / SQRT(a[i+63:i]))
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vrsqrt14pd
__m512d _mm512_maskz_rsqrt14_pd (__mmask8 k, __m512d a)
Synopsis
__m512d _mm512_maskz_rsqrt14_pd (__mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vrsqrt14pd zmm {k}, zmm
CPUID Flags: AVX512F
Description
Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := APPROXIMATE(1.0 / SQRT(a[i+63:i]))
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vrsqrt14pd
__m512d _mm512_rsqrt14_pd (__m512d a)
Synopsis
__m512d _mm512_rsqrt14_pd (__m512d a)
#include "immintrin.h"
Instruction: vrsqrt14pd zmm {k}, zmm
CPUID Flags: AVX512F
Description
Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := APPROXIMATE(1.0 / SQRT(a[i+63:i]))
ENDFOR
dst[MAX:512] := 0
vrsqrt14ps
__m128 _mm_mask_rsqrt14_ps (__m128 src, __mmask8 k, __m128 a)
Synopsis
__m128 _mm_mask_rsqrt14_ps (__m128 src, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vrsqrt14ps
CPUID Flags: AVX512VL + AVX512F
Description
Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i]))
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vrsqrt14ps
__m128 _mm_maskz_rsqrt14_ps (__mmask8 k, __m128 a)
Synopsis
__m128 _mm_maskz_rsqrt14_ps (__mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vrsqrt14ps
CPUID Flags: AVX512VL + AVX512F
Description
Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i]))
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vrsqrt14ps
__m256 _mm256_mask_rsqrt14_ps (__m256 src, __mmask8 k, __m256 a)
Synopsis
__m256 _mm256_mask_rsqrt14_ps (__m256 src, __mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vrsqrt14ps
CPUID Flags: AVX512VL + AVX512F
Description
Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i]))
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vrsqrt14ps
__m256 _mm256_maskz_rsqrt14_ps (__mmask8 k, __m256 a)
Synopsis
__m256 _mm256_maskz_rsqrt14_ps (__mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vrsqrt14ps
CPUID Flags: AVX512VL + AVX512F
Description
Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i]))
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vrsqrt14ps
__m512 _mm512_mask_rsqrt14_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_rsqrt14_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vrsqrt14ps zmm {k}, zmm
CPUID Flags: AVX512F
Description
Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i]))
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vrsqrt14ps
__m512 _mm512_maskz_rsqrt14_ps (__mmask16 k, __m512 a)
Synopsis
__m512 _mm512_maskz_rsqrt14_ps (__mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vrsqrt14ps zmm {k}, zmm
CPUID Flags: AVX512F
Description
Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-14.
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i]))
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vrsqrt14ps
__m512 _mm512_rsqrt14_ps (__m512 a)
Synopsis
__m512 _mm512_rsqrt14_ps (__m512 a)
#include "immintrin.h"
Instruction: vrsqrt14ps zmm {k}, zmm
CPUID Flags: AVX512F
Description
Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. The maximum relative error for this approximation is less than 2^-14.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i]))
ENDFOR
dst[MAX:512] := 0
vrsqrt14sd
__m128d _mm_mask_rsqrt14_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_mask_rsqrt14_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vrsqrt14sd xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14.
Operation
IF k[0]
dst[63:0] := APPROXIMATE(1.0 / SQRT(b[63:0]))
ELSE
dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vrsqrt14sd
__m128d _mm_maskz_rsqrt14_sd (__mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_maskz_rsqrt14_sd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vrsqrt14sd xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14.
Operation
IF k[0]
dst[63:0] := APPROXIMATE(1.0 / SQRT(b[63:0]))
ELSE
dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vrsqrt14sd
__m128d _mm_rsqrt14_sd (__m128d a, __m128d b)
Synopsis
__m128d _mm_rsqrt14_sd (__m128d a, __m128d b)
#include "immintrin.h"
Instruction: vrsqrt14sd xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-14.
Operation
dst[63:0] := APPROXIMATE(1.0 / SQRT(b[63:0]))
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vrsqrt14ss
__m128 _mm_mask_rsqrt14_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_mask_rsqrt14_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vrsqrt14ss xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14.
Operation
IF k[0]
dst[31:0] := APPROXIMATE(1.0 / SQRT(b[31:0]))
ELSE
dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vrsqrt14ss
__m128 _mm_maskz_rsqrt14_ss (__mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_maskz_rsqrt14_ss (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vrsqrt14ss xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14.
Operation
IF k[0]
dst[31:0] := APPROXIMATE(1.0 / SQRT(b[31:0]))
ELSE
dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vrsqrt14ss
__m128 _mm_rsqrt14_ss (__m128 a, __m128 b)
Synopsis
__m128 _mm_rsqrt14_ss (__m128 a, __m128 b)
#include "immintrin.h"
Instruction: vrsqrt14ss xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-14.
Operation
dst[31:0] := APPROXIMATE(1.0 / SQRT(b[31:0]))
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vrsqrt23ps
__m512 _mm512_mask_rsqrt23_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_rsqrt23_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vrsqrt23ps zmm {k}, m512
CPUID Flags: KNCNI
Description
Calculates the reciprocal square root of packed single-precision (32-bit) floating-point elements in a to 23 bits of accuracy and stores the result in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := Sqrt(1.0 / a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vrsqrt23ps
__m512 _mm512_rsqrt23_ps (__m512 a)
Synopsis
__m512 _mm512_rsqrt23_ps (__m512 a)
#include "immintrin.h"
Instruction: vrsqrt23ps zmm {k}, m512
CPUID Flags: KNCNI
Description
Calculates the reciprocal square root of packed single-precision (32-bit) floating-point elements in a to 23 bits of accuracy and stores the result in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := Sqrt(1.0 / a[i+31:i])
ENDFOR
dst[MAX:512] := 0
vrsqrt28pd
__m512d _mm512_mask_rsqrt28_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_rsqrt28_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vrsqrt28pd zmm {k}, zmm
CPUID Flags: AVX512ER
Description
Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28.
Operation
FOR j := 0 to 7
i := j*64;
IF k[j] THEN
dst[i+63:i] := (1.0/SQRT(a[i+63:i]));
ELSE
dst[i+63:i] := src[i+63:i];
FI
ENDFOR;
vrsqrt28pd
__m512d _mm512_maskz_rsqrt28_pd (__mmask8 k, __m512d a)
Synopsis
__m512d _mm512_maskz_rsqrt28_pd (__mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vrsqrt28pd zmm {k}, zmm
CPUID Flags: AVX512ER
Description
Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28.
Operation
FOR j := 0 to 7
i := j*64;
IF k[j] THEN
dst[i+63:i] := (1.0/SQRT(a[i+63:i]));
ELSE
dst[i+63:i] := 0;
FI
ENDFOR;
vrsqrt28pd
__m512d _mm512_rsqrt28_pd (__m512d a)
Synopsis
__m512d _mm512_rsqrt28_pd (__m512d a)
#include "immintrin.h"
Instruction: vrsqrt28pd zmm {k}, zmm
CPUID Flags: AVX512ER
Description
Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, store the results in dst. The maximum relative error for this approximation is less than 2^-28.
Operation
FOR j := 0 to 7
i := j*64;
dst[i+63:i] := (1.0/SQRT(a[i+63:i]));
ENDFOR;
vrsqrt28ps
__m512 _mm512_mask_rsqrt28_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_rsqrt28_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vrsqrt28ps zmm {k}, zmm
CPUID Flags: AVX512ER
Description
Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28.
Operation
FOR j := 0 to 15
i := j*32;
IF k[j] THEN
dst[i+31:i] := (1.0/SQRT(a[i+31:i]));
ELSE
dst[i+31:i] := src[i+31:i];
FI
ENDFOR;
vrsqrt28ps
__m512 _mm512_maskz_rsqrt28_ps (__mmask16 k, __m512 a)
Synopsis
__m512 _mm512_maskz_rsqrt28_ps (__mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vrsqrt28ps zmm {k}, zmm
CPUID Flags: AVX512ER
Description
Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28.
Operation
FOR j := 0 to 15
i := j*32;
IF k[j] THEN
dst[i+31:i] := (1.0/SQRT(a[i+31:i]));
ELSE
dst[i+31:i] := 0;
FI
ENDFOR;
vrsqrt28ps
__m512 _mm512_rsqrt28_ps (__m512 a)
Synopsis
__m512 _mm512_rsqrt28_ps (__m512 a)
#include "immintrin.h"
Instruction: vrsqrt28ps zmm {k}, zmm
CPUID Flags: AVX512ER
Description
Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, store the results in dst. The maximum relative error for this approximation is less than 2^-28.
Operation
FOR j := 0 to 15
i := j*32;
dst[i+31:i] := (1.0/SQRT(a[i+31:i]));
ENDFOR;
vrsqrt28pd
__m512d _mm512_mask_rsqrt28_round_pd (__m512d src, __mmask8 k, __m512d a, int rounding)
Synopsis
__m512d _mm512_mask_rsqrt28_round_pd (__m512d src, __mmask8 k, __m512d a, int rounding)
#include "immintrin.h"
Instruction: vrsqrt28pd zmm {k}, zmm {er}
CPUID Flags: AVX512ER
Description
Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64;
IF k[j] THEN
dst[i+63:i] := (1.0/SQRT(a[i+63:i]));
ELSE
dst[i+63:i] := src[i+63:i];
FI
ENDFOR;
vrsqrt28pd
__m512d _mm512_maskz_rsqrt28_round_pd (__mmask8 k, __m512d a, int rounding)
Synopsis
__m512d _mm512_maskz_rsqrt28_round_pd (__mmask8 k, __m512d a, int rounding)
#include "immintrin.h"
Instruction: vrsqrt28pd zmm {k}, zmm {er}
CPUID Flags: AVX512ER
Description
Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64;
IF k[j] THEN
dst[i+63:i] := (1.0/SQRT(a[i+63:i]));
ELSE
dst[i+63:i] := 0;
FI
ENDFOR;
vrsqrt28pd
__m512d _mm512_rsqrt28_round_pd (__m512d a, int rounding)
Synopsis
__m512d _mm512_rsqrt28_round_pd (__m512d a, int rounding)
#include "immintrin.h"
Instruction: vrsqrt28pd zmm {k}, zmm {er}
CPUID Flags: AVX512ER
Description
Compute the approximate reciprocal square root of packed double-precision (64-bit) floating-point elements in a, store the results in dst. The maximum relative error for this approximation is less than 2^-28.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64;
dst[i+63:i] := (1.0/SQRT(a[i+63:i]));
ENDFOR;
vrsqrt28ps
__m512 _mm512_mask_rsqrt28_round_ps (__m512 src, __mmask16 k, __m512 a, int rounding)
Synopsis
__m512 _mm512_mask_rsqrt28_round_ps (__m512 src, __mmask16 k, __m512 a, int rounding)
#include "immintrin.h"
Instruction: vrsqrt28ps zmm {k}, zmm {er}
CPUID Flags: AVX512ER
Description
Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32;
IF k[j] THEN
dst[i+31:i] := (1.0/SQRT(a[i+31:i]));
ELSE
dst[i+31:i] := src[i+31:i];
FI
ENDFOR;
vrsqrt28ps
__m512 _mm512_maskz_rsqrt28_round_ps (__mmask16 k, __m512 a, int rounding)
Synopsis
__m512 _mm512_maskz_rsqrt28_round_ps (__mmask16 k, __m512 a, int rounding)
#include "immintrin.h"
Instruction: vrsqrt28ps zmm {k}, zmm {er}
CPUID Flags: AVX512ER
Description
Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). The maximum relative error for this approximation is less than 2^-28.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32;
IF k[j] THEN
dst[i+31:i] := (1.0/SQRT(a[i+31:i]));
ELSE
dst[i+31:i] := 0;
FI
ENDFOR;
vrsqrt28ps
__m512 _mm512_rsqrt28_round_ps (__m512 a, int rounding)
Synopsis
__m512 _mm512_rsqrt28_round_ps (__m512 a, int rounding)
#include "immintrin.h"
Instruction: vrsqrt28ps zmm {k}, zmm {er}
CPUID Flags: AVX512ER
Description
Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in a, store the results in dst. The maximum relative error for this approximation is less than 2^-28.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32;
dst[i+31:i] := (1.0/SQRT(a[i+31:i]));
ENDFOR;
vrsqrt28sd
__m128d _mm_mask_rsqrt28_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int rounding)
Synopsis
__m128d _mm_mask_rsqrt28_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vrsqrt28sd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512ER
Description
Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-28.
Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0] THEN
dst[63:0] := (1.0/SQRT(b[63:0]));
ELSE
dst[63:0] := src[63:0];
FI
dst[127:64] := a[127:64];
dst[MAX:128] := 0;
vrsqrt28sd
__m128d _mm_maskz_rsqrt28_round_sd (__mmask8 k, __m128d a, __m128d b, int rounding)
Synopsis
__m128d _mm_maskz_rsqrt28_round_sd (__mmask8 k, __m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vrsqrt28sd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512ER
Description
Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in
b, store the result in the lower element of
dst using zeromask
k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from
a to the upper element of
dst. The maximum relative error for this approximation is less than 2^-28.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0] THEN
dst[63:0] := (1.0/SQRT(b[63:0]));
ELSE
dst[63:0] := 0;
FI
dst[127:64] := a[127:64];
dst[MAX:128] := 0;
vrsqrt28sd
__m128d _mm_rsqrt28_round_sd (__m128d a, __m128d b, int rounding)
Synopsis
__m128d _mm_rsqrt28_round_sd (__m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vrsqrt28sd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512ER
Description
Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in
b, store the result in the lower element of
dst, and copy the upper element from
a to the upper element of
dst. The maximum relative error for this approximation is less than 2^-28.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[63:0] := (1.0/SQRT(b[63:0]));
dst[127:64] := a[127:64];
dst[MAX:128] := 0;
vrsqrt28ss
__m128 _mm_mask_rsqrt28_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int rounding)
Synopsis
__m128 _mm_mask_rsqrt28_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vrsqrt28ss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512ER
Description
Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in
b, store the result in the lower element of
dst using writemask
k (the element is copied from
src when mask bit 0 is not set), and copy the upper 3 packed elements from
a to the upper elements of
dst. The maximum relative error for this approximation is less than 2^-28.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0] THEN
dst[31:0] := (1.0/SQRT(b[31:0]));
ELSE
dst[31:0] := src[31:0];
FI
dst[127:32] := a[127:32];
dst[MAX:128] := 0;
vrsqrt28ss
__m128 _mm_maskz_rsqrt28_round_ss (__mmask8 k, __m128 a, __m128 b, int rounding)
Synopsis
__m128 _mm_maskz_rsqrt28_round_ss (__mmask8 k, __m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vrsqrt28ss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512ER
Description
Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in
b, store the result in the lower element of
dst using zeromask
k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from
a to the upper elements of
dst. The maximum relative error for this approximation is less than 2^-28.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0] THEN
dst[31:0] := (1.0/SQRT(b[31:0]));
ELSE
dst[31:0] := 0;
FI
dst[127:32] := a[127:32];
dst[MAX:128] := 0;
vrsqrt28ss
__m128 _mm_rsqrt28_round_ss (__m128 a, __m128 b, int rounding)
Synopsis
__m128 _mm_rsqrt28_round_ss (__m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vrsqrt28ss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512ER
Description
Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in
b, store the result in the lower element of
dst, and copy the upper 3 packed elements from
a to the upper elements of
dst. The maximum relative error for this approximation is less than 2^-28.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[31:0] := (1.0/SQRT(b[31:0]));
dst[127:32] := a[127:32];
dst[MAX:128] := 0;
vrsqrt28sd
__m128d _mm_mask_rsqrt28_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_mask_rsqrt28_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vrsqrt28sd xmm {k}, xmm, xmm
CPUID Flags: AVX512ER
Description
Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-28.
Operation
IF k[0] THEN
dst[63:0] := (1.0/SQRT(b[63:0]));
ELSE
dst[63:0] := src[63:0];
FI
dst[127:64] := a[127:64];
dst[MAX:128] := 0;
vrsqrt28sd
__m128d _mm_maskz_rsqrt28_sd (__mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_maskz_rsqrt28_sd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vrsqrt28sd xmm {k}, xmm, xmm
CPUID Flags: AVX512ER
Description
Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-28.
Operation
IF k[0] THEN
dst[63:0] := (1.0/SQRT(b[63:0]));
ELSE
dst[63:0] := 0;
FI
dst[127:64] := a[127:64];
dst[MAX:128] := 0;
vrsqrt28sd
__m128d _mm_rsqrt28_sd (__m128d a, __m128d b)
Synopsis
__m128d _mm_rsqrt28_sd (__m128d a, __m128d b)
#include "immintrin.h"
Instruction: vrsqrt28sd xmm {k}, xmm, xmm
CPUID Flags: AVX512ER
Description
Compute the approximate reciprocal square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. The maximum relative error for this approximation is less than 2^-28.
Operation
dst[63:0] := (1.0/SQRT(b[63:0]));
dst[127:64] := a[127:64];
dst[MAX:128] := 0;
vrsqrt28ss
__m128 _mm_mask_rsqrt28_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_mask_rsqrt28_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vrsqrt28ss xmm {k}, xmm, xmm
CPUID Flags: AVX512ER
Description
Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-28.
Operation
IF k[0] THEN
dst[31:0] := (1.0/SQRT(b[31:0]));
ELSE
dst[31:0] := src[31:0];
FI
dst[127:32] := a[127:32];
dst[MAX:128] := 0;
vrsqrt28ss
__m128 _mm_maskz_rsqrt28_ss (__mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_maskz_rsqrt28_ss (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vrsqrt28ss xmm {k}, xmm, xmm
CPUID Flags: AVX512ER
Description
Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-28.
Operation
IF k[0] THEN
dst[31:0] := (1.0/SQRT(b[31:0]));
ELSE
dst[31:0] := 0;
FI
dst[127:32] := a[127:32];
dst[MAX:128] := 0;
vrsqrt28ss
__m128 _mm_rsqrt28_ss (__m128 a, __m128 b)
Synopsis
__m128 _mm_rsqrt28_ss (__m128 a, __m128 b)
#include "immintrin.h"
Instruction: vrsqrt28ss xmm {k}, xmm, xmm
CPUID Flags: AVX512ER
Description
Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. The maximum relative error for this approximation is less than 2^-28.
Operation
dst[31:0] := (1.0/SQRT(b[31:0]));
dst[127:32] := a[127:32];
dst[MAX:128] := 0;
psadbw
__m128i _mm_sad_epu8 (__m128i a, __m128i b)
Synopsis
__m128i _mm_sad_epu8 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: psadbw xmm, xmm
CPUID Flags: SSE2
Description
Compute the absolute differences of packed unsigned 8-bit integers in a and b, then horizontally sum each consecutive 8 differences to produce two unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low 16 bits of 64-bit elements in dst.
Operation
FOR j := 0 to 15
i := j*8
tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i])
ENDFOR
FOR j := 0 to 1
i := j*64
dst[i+15:i] := tmp[i+7:i] + tmp[i+15:i+8] + tmp[i+23:i+16] + tmp[i+31:i+24] +
tmp[i+39:i+32] + tmp[i+47:i+40] + tmp[i+55:i+48] + tmp[i+63:i+56]
dst[i+63:i+16] := 0
ENDFOR
Performance
vpsadbw
__m256i _mm256_sad_epu8 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_sad_epu8 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsadbw ymm, ymm, ymm
CPUID Flags: AVX2
Description
Compute the absolute differences of packed unsigned 8-bit integers in a and b, then horizontally sum each consecutive 8 differences to produce four unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low 16 bits of 64-bit elements in dst.
Operation
FOR j := 0 to 31
i := j*8
tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i])
ENDFOR
FOR j := 0 to 3
i := j*64
dst[i+15:i] := tmp[i+7:i] + tmp[i+15:i+8] + tmp[i+23:i+16] + tmp[i+31:i+24] + tmp[i+39:i+32] + tmp[i+47:i+40] + tmp[i+55:i+48] + tmp[i+63:i+56]
dst[i+63:i+16] := 0
ENDFOR
dst[MAX:256] := 0
Performance
vpsadbw
__m512i _mm512_sad_epu8 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_sad_epu8 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpsadbw
CPUID Flags: AVX512BW
Description
Compute the absolute differences of packed unsigned 8-bit integers in a and b, then horizontally sum each consecutive 8 differences to produce eight unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low 16 bits of 64-bit elements in dst.
Operation
FOR j := 0 to 63
i := j*8
tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i])
ENDFOR
FOR j := 0 to 7
i := j*64
dst[i+15:i] := tmp[i+7:i] + tmp[i+15:i+8] + tmp[i+23:i+16] + tmp[i+31:i+24] + tmp[i+39:i+32] + tmp[i+47:i+40] + tmp[i+55:i+48] + tmp[i+63:i+56]
dst[i+63:i+16] := 0
ENDFOR
dst[MAX:512] := 0
psadbw
__m64 _mm_sad_pu8 (__m64 a, __m64 b)
Synopsis
__m64 _mm_sad_pu8 (__m64 a, __m64 b)
#include "xmmintrin.h"
Instruction: psadbw mm, mm
CPUID Flags: SSE
Description
Compute the absolute differences of packed unsigned 8-bit integers in a and b, then horizontally sum each consecutive 8 differences to produce a single unsigned 16-bit integer, and pack this unsigned 16-bit integer in the low 16 bits of dst.
Operation
FOR j := 0 to 7
i := j*8
tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i])
ENDFOR
dst[15:0] := tmp[7:0] + tmp[15:8] + tmp[23:16] + tmp[31:24] + tmp[39:32] + tmp[47:40] + tmp[55:48] + tmp[63:56]
dst[63:16] := 0
vpsbbd
__m512i _mm512_mask_sbb_epi32 (__m512i v2, __mmask16 k1, __mmask16 k2, __m512i v3, __mmask16 * borrow)
Synopsis
__m512i _mm512_mask_sbb_epi32 (__m512i v2, __mmask16 k1, __mmask16 k2, __m512i v3, __mmask16 * borrow)
#include "immintrin.h"
Instruction: vpsbbd zmm {k}, k, zmm
CPUID Flags: KNCNI
Description
Performs element-by-element three-input subtraction of packed 32-bit integer elements of v3 as well as the corresponding bit from k2 from v2. The borrowed value from the subtraction difference for the nth element is written to the nth bit of borrow (borrow flag). Results are stored in dst using writemask k1 (elements are copied from v2 when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k1[j]
dst[i+31:i] := v2[i+31:i] - v3[i+31:i] - k2[j]
borrow[j] := Borrow(v2[i+31:i] - v3[i+31:i] - k2[j])
ELSE
dst[i+31:i] := v2[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpsbbd
__m512i _mm512_sbb_epi32 (__m512i v2, __mmask16 k, __m512i v3, __mmask16 * borrow)
Synopsis
__m512i _mm512_sbb_epi32 (__m512i v2, __mmask16 k, __m512i v3, __mmask16 * borrow)
#include "immintrin.h"
Instruction: vpsbbd zmm {k}, k, zmm
CPUID Flags: KNCNI
Description
Performs element-by-element three-input subtraction of packed 32-bit integer elements of v3 as well as the corresponding bit from k from v2. The borrowed value from the subtraction difference for the nth element is written to the nth bit of borrow (borrow flag). Results are stored in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := v2[i+31:i] - v3[i+31:i] - k[j]
borrow[j] := Borrow(v2[i+31:i] - v3[i+31:i] - k[j])
ENDFOR
dst[MAX:512] := 0
vpsbbrd
__m512i _mm512_mask_sbbr_epi32 (__m512i v2, __mmask16 k1, __mmask16 k2, __m512i v3, __mmask16 * borrow)
Synopsis
__m512i _mm512_mask_sbbr_epi32 (__m512i v2, __mmask16 k1, __mmask16 k2, __m512i v3, __mmask16 * borrow)
#include "immintrin.h"
Instruction: vpsbbrd zmm {k}, k, zmm
CPUID Flags: KNCNI
Description
Performs element-by-element three-input subtraction of packed 32-bit integer elements of v2 as well as the corresponding bit from k2 from v3. The borrowed value from the subtraction difference for the nth element is written to the nth bit of borrow (borrow flag). Results are stored in dst using writemask k1 (elements are copied from v2 when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k1[j]
dst[i+31:i] := v3[i+31:i] - v2[i+31:i] - k2[j]
borrow[j] := Borrow(v3[i+31:i] - v2[i+31:i] - k2[j])
ELSE
dst[i+31:i] := v2[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpsbbrd
__m512i _mm512_sbbr_epi32 (__m512i v2, __mmask16 k, __m512i v3, __mmask16 * borrow)
Synopsis
__m512i _mm512_sbbr_epi32 (__m512i v2, __mmask16 k, __m512i v3, __mmask16 * borrow)
#include "immintrin.h"
Instruction: vpsbbrd zmm {k}, k, zmm
CPUID Flags: KNCNI
Description
Performs element-by-element three-input subtraction of packed 32-bit integer elements of v2 as well as the corresponding bit from k from v3. The borrowed value from the subtraction difference for the nth element is written to the nth bit of borrow (borrow flag). Results are stored in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := v3[i+31:i] - v2[i+31:i] - k[j]
borrow[j] := Borrow(v3[i+31:i] - v2[i+31:i] - k[j])
ENDFOR
dst[MAX:512] := 0
vscaleps
__m512 _mm512_mask_scale_ps (__m512 src, __mmask16 k, __m512 a, __m512i b)
Synopsis
__m512 _mm512_mask_scale_ps (__m512 src, __mmask16 k, __m512 a, __m512i b)
#include "immintrin.h"
Instruction: vscaleps zmm {k}, zmm, m512
CPUID Flags: KNCNI
Description
Scales each single-precision (32-bit) floating-point element in a by multiplying it by 2**exponent, where the exponent is the corresponding 32-bit integer element in b, storing results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] * Pow(2, b[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vscaleps
__m512 _mm512_scale_ps (__m512 a, __m512i b)
Synopsis
__m512 _mm512_scale_ps (__m512 a, __m512i b)
#include "immintrin.h"
Instruction: vscaleps zmm {k}, zmm, m512
CPUID Flags: KNCNI
Description
Scales each single-precision (32-bit) floating-point element in a by multiplying it by 2**exponent, where the exponent is the corresponding 32-bit integer element in b, storing results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := a[i+31:i] * Pow(2, b[i+31:i])
ENDFOR
dst[MAX:512] := 0
vscaleps
__m512 _mm512_mask_scale_round_ps (__m512 src, __mmask16 k, __m512 a, __m512i b, int rounding)
Synopsis
__m512 _mm512_mask_scale_round_ps (__m512 src, __mmask16 k, __m512 a, __m512i b, int rounding)
#include "immintrin.h"
Instruction: vscaleps zmm {k}, zmm, m512
CPUID Flags: KNCNI
Description
Scales each single-precision (32-bit) floating-point element in
a by multiplying it by 2**exp, where the exp is the corresponding 32-bit integer element in
b, storing results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set). Results are rounded using constant
rounding.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] * Pow(2, b[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vscaleps
__m512 _mm512_scale_round_ps (__m512 a, __m512i b, int rounding)
Synopsis
__m512 _mm512_scale_round_ps (__m512 a, __m512i b, int rounding)
#include "immintrin.h"
Instruction: vscaleps zmm {k}, zmm, m512
CPUID Flags: KNCNI
Description
Scales each single-precision (32-bit) floating-point element in
a by multiplying it by 2**exponent, where the exponent is the corresponding 32-bit integer element in
b, storing results in
dst. Intermediate elements are rounded using
rounding.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := a[i+31:i] * Pow(2, b[i+31:i])
ENDFOR
dst[MAX:512] := 0
vscalefpd
__m128d _mm_mask_scalef_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_mask_scalef_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vscalefpd
CPUID Flags: AVX512VL + AVX512F
Description
Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
SCALE(src1, src2){
IF (src2 == NaN)
IF (src2 == SNaN)
RETURN QNAN(src2)
FI
ELSE IF (src1 == NaN)
IF (src1 == SNaN)
RETURN QNAN(src1)
FI
IF (src2 != INF)
RETURN QNAN(src1)
FI
ELSE
tmp_src2 := src2
tmp_src1 := src1
IF (src2 is denormal AND MXCSR.DAZ)
tmp_src2 := 0
FI
IF (src1 is denormal AND MXCSR.DAZ)
tmp_src1 := 0
FI
FI
dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
RETURN dst[63:0]
}
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vscalefpd
__m128d _mm_maskz_scalef_pd (__mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_maskz_scalef_pd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vscalefpd
CPUID Flags: AVX512VL + AVX512F
Description
Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
SCALE(src1, src2){
IF (src2 == NaN)
IF (src2 == SNaN)
RETURN QNAN(src2)
FI
ELSE IF (src1 == NaN)
IF (src1 == SNaN)
RETURN QNAN(src1)
FI
IF (src2 != INF)
RETURN QNAN(src1)
FI
ELSE
tmp_src2 := src2
tmp_src1 := src1
IF (src2 is denormal AND MXCSR.DAZ)
tmp_src2 := 0
FI
IF (src1 is denormal AND MXCSR.DAZ)
tmp_src1 := 0
FI
FI
dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
RETURN dst[63:0]
}
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vscalefpd
__m128d _mm_scalef_pd (__m128d a, __m128d b)
Synopsis
__m128d _mm_scalef_pd (__m128d a, __m128d b)
#include "immintrin.h"
Instruction: vscalefpd
CPUID Flags: AVX512VL + AVX512F
Description
Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst.
Operation
SCALE(src1, src2){
IF (src2 == NaN)
IF (src2 == SNaN)
RETURN QNAN(src2)
FI
ELSE IF (src1 == NaN)
IF (src1 == SNaN)
RETURN QNAN(src1)
FI
IF (src2 != INF)
RETURN QNAN(src1)
FI
ELSE
tmp_src2 := src2
tmp_src1 := src1
IF (src2 is denormal AND MXCSR.DAZ)
tmp_src2 := 0
FI
IF (src1 is denormal AND MXCSR.DAZ)
tmp_src1 := 0
FI
FI
dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
RETURN dst[63:0]
}
FOR j := 0 to 1
i := j*64
dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:128] := 0
vscalefpd
__m256d _mm256_mask_scalef_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)
Synopsis
__m256d _mm256_mask_scalef_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vscalefpd
CPUID Flags: AVX512VL + AVX512F
Description
Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
SCALE(src1, src2){
IF (src2 == NaN)
IF (src2 == SNaN)
RETURN QNAN(src2)
FI
ELSE IF (src1 == NaN)
IF (src1 == SNaN)
RETURN QNAN(src1)
FI
IF (src2 != INF)
RETURN QNAN(src1)
FI
ELSE
tmp_src2 := src2
tmp_src1 := src1
IF (src2 is denormal AND MXCSR.DAZ)
tmp_src2 := 0
FI
IF (src1 is denormal AND MXCSR.DAZ)
tmp_src1 := 0
FI
FI
dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
RETURN dst[63:0]
}
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vscalefpd
__m256d _mm256_maskz_scalef_pd (__mmask8 k, __m256d a, __m256d b)
Synopsis
__m256d _mm256_maskz_scalef_pd (__mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vscalefpd
CPUID Flags: AVX512VL + AVX512F
Description
Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
SCALE(src1, src2){
IF (src2 == NaN)
IF (src2 == SNaN)
RETURN QNAN(src2)
FI
ELSE IF (src1 == NaN)
IF (src1 == SNaN)
RETURN QNAN(src1)
FI
IF (src2 != INF)
RETURN QNAN(src1)
FI
ELSE
tmp_src2 := src2
tmp_src1 := src1
IF (src2 is denormal AND MXCSR.DAZ)
tmp_src2 := 0
FI
IF (src1 is denormal AND MXCSR.DAZ)
tmp_src1 := 0
FI
FI
dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
RETURN dst[63:0]
}
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vscalefpd
__m256d _mm256_scalef_pd (__m256d a, __m256d b)
Synopsis
__m256d _mm256_scalef_pd (__m256d a, __m256d b)
#include "immintrin.h"
Instruction: vscalefpd
CPUID Flags: AVX512VL + AVX512F
Description
Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst.
Operation
SCALE(src1, src2){
IF (src2 == NaN)
IF (src2 == SNaN)
RETURN QNAN(src2)
FI
ELSE IF (src1 == NaN)
IF (src1 == SNaN)
RETURN QNAN(src1)
FI
IF (src2 != INF)
RETURN QNAN(src1)
FI
ELSE
tmp_src2 := src2
tmp_src1 := src1
IF (src2 is denormal AND MXCSR.DAZ)
tmp_src2 := 0
FI
IF (src1 is denormal AND MXCSR.DAZ)
tmp_src1 := 0
FI
FI
dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
RETURN dst[63:0]
}
FOR j := 0 to 3
i := j*64
dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:256] := 0
vscalefpd
__m512d _mm512_mask_scalef_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
Synopsis
__m512d _mm512_mask_scalef_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vscalefpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
SCALE(src1, src2){
IF (src2 == NaN)
IF (src2 == SNaN)
RETURN QNAN(src2)
FI
ELSE IF (src1 == NaN)
IF (src1 == SNaN)
RETURN QNAN(src1)
FI
IF (src2 != INF)
RETURN QNAN(src1)
FI
ELSE
tmp_src2 := src2
tmp_src1 := src1
IF (src2 is denormal AND MXCSR.DAZ)
tmp_src2 := 0
FI
IF (src1 is denormal AND MXCSR.DAZ)
tmp_src1 := 0
FI
FI
dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
RETURN dst[63:0]
}
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vscalefpd
__m512d _mm512_maskz_scalef_pd (__mmask8 k, __m512d a, __m512d b)
Synopsis
__m512d _mm512_maskz_scalef_pd (__mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vscalefpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
SCALE(src1, src2){
IF (src2 == NaN)
IF (src2 == SNaN)
RETURN QNAN(src2)
FI
ELSE IF (src1 == NaN)
IF (src1 == SNaN)
RETURN QNAN(src1)
FI
IF (src2 != INF)
RETURN QNAN(src1)
FI
ELSE
tmp_src2 := src2
tmp_src1 := src1
IF (src2 is denormal AND MXCSR.DAZ)
tmp_src2 := 0
FI
IF (src1 is denormal AND MXCSR.DAZ)
tmp_src1 := 0
FI
FI
dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
RETURN dst[63:0]
}
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vscalefpd
__m512d _mm512_scalef_pd (__m512d a, __m512d b)
Synopsis
__m512d _mm512_scalef_pd (__m512d a, __m512d b)
#include "immintrin.h"
Instruction: vscalefpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Scale the packed double-precision (64-bit) floating-point elements in a using values from b, and store the results in dst.
Operation
SCALE(src1, src2){
IF (src2 == NaN)
IF (src2 == SNaN)
RETURN QNAN(src2)
FI
ELSE IF (src1 == NaN)
IF (src1 == SNaN)
RETURN QNAN(src1)
FI
IF (src2 != INF)
RETURN QNAN(src1)
FI
ELSE
tmp_src2 := src2
tmp_src1 := src1
IF (src2 is denormal AND MXCSR.DAZ)
tmp_src2 := 0
FI
IF (src1 is denormal AND MXCSR.DAZ)
tmp_src1 := 0
FI
FI
dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
RETURN dst[63:0]
}
FOR j := 0 to 7
i := j*64
dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:512] := 0
vscalefps
__m128 _mm_mask_scalef_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_mask_scalef_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vscalefps
CPUID Flags: AVX512VL + AVX512F
Description
Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
SCALE(src1, src2){
IF (src2 == NaN)
IF (src2 == SNaN)
RETURN QNAN(src2)
FI
ELSE IF (src1 == NaN)
IF (src1 == SNaN)
RETURN QNAN(src1)
FI
IF (src2 != INF)
RETURN QNAN(src1)
FI
ELSE
tmp_src2 := src2
tmp_src1 := src1
IF (src2 is denormal AND MXCSR.DAZ)
tmp_src2 := 0
FI
IF (src1 is denormal AND MXCSR.DAZ)
tmp_src1 := 0
FI
FI
dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
RETURN dst[31:0]
}
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vscalefps
__m128 _mm_maskz_scalef_ps (__mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_maskz_scalef_ps (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vscalefps
CPUID Flags: AVX512VL + AVX512F
Description
Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
SCALE(src1, src2){
IF (src2 == NaN)
IF (src2 == SNaN)
RETURN QNAN(src2)
FI
ELSE IF (src1 == NaN)
IF (src1 == SNaN)
RETURN QNAN(src1)
FI
IF (src2 != INF)
RETURN QNAN(src1)
FI
ELSE
tmp_src2 := src2
tmp_src1 := src1
IF (src2 is denormal AND MXCSR.DAZ)
tmp_src2 := 0
FI
IF (src1 is denormal AND MXCSR.DAZ)
tmp_src1 := 0
FI
FI
dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
RETURN dst[31:0]
}
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vscalefps
__m128 _mm_scalef_ps (__m128 a, __m128 b)
Synopsis
__m128 _mm_scalef_ps (__m128 a, __m128 b)
#include "immintrin.h"
Instruction: vscalefps
CPUID Flags: AVX512VL + AVX512F
Description
Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst.
Operation
SCALE(src1, src2){
IF (src2 == NaN)
IF (src2 == SNaN)
RETURN QNAN(src2)
FI
ELSE IF (src1 == NaN)
IF (src1 == SNaN)
RETURN QNAN(src1)
FI
IF (src2 != INF)
RETURN QNAN(src1)
FI
ELSE
tmp_src2 := src2
tmp_src1 := src1
IF (src2 is denormal AND MXCSR.DAZ)
tmp_src2 := 0
FI
IF (src1 is denormal AND MXCSR.DAZ)
tmp_src1 := 0
FI
FI
dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
RETURN dst[31:0]
}
FOR j := 0 to 3
i := j*32
dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
ENDFOR
dst[MAX:128] := 0
vscalefps
__m256 _mm256_mask_scalef_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)
Synopsis
__m256 _mm256_mask_scalef_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vscalefps
CPUID Flags: AVX512VL + AVX512F
Description
Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
SCALE(src1, src2){
IF (src2 == NaN)
IF (src2 == SNaN)
RETURN QNAN(src2)
FI
ELSE IF (src1 == NaN)
IF (src1 == SNaN)
RETURN QNAN(src1)
FI
IF (src2 != INF)
RETURN QNAN(src1)
FI
ELSE
tmp_src2 := src2
tmp_src1 := src1
IF (src2 is denormal AND MXCSR.DAZ)
tmp_src2 := 0
FI
IF (src1 is denormal AND MXCSR.DAZ)
tmp_src1 := 0
FI
FI
dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
RETURN dst[31:0]
}
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vscalefps
__m256 _mm256_maskz_scalef_ps (__mmask8 k, __m256 a, __m256 b)
Synopsis
__m256 _mm256_maskz_scalef_ps (__mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vscalefps
CPUID Flags: AVX512VL + AVX512F
Description
Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
SCALE(src1, src2){
IF (src2 == NaN)
IF (src2 == SNaN)
RETURN QNAN(src2)
FI
ELSE IF (src1 == NaN)
IF (src1 == SNaN)
RETURN QNAN(src1)
FI
IF (src2 != INF)
RETURN QNAN(src1)
FI
ELSE
tmp_src2 := src2
tmp_src1 := src1
IF (src2 is denormal AND MXCSR.DAZ)
tmp_src2 := 0
FI
IF (src1 is denormal AND MXCSR.DAZ)
tmp_src1 := 0
FI
FI
dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
RETURN dst[31:0]
}
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vscalefps
__m256 _mm256_scalef_ps (__m256 a, __m256 b)
Synopsis
__m256 _mm256_scalef_ps (__m256 a, __m256 b)
#include "immintrin.h"
Instruction: vscalefps
CPUID Flags: AVX512VL + AVX512F
Description
Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst.
Operation
SCALE(src1, src2){
IF (src2 == NaN)
IF (src2 == SNaN)
RETURN QNAN(src2)
FI
ELSE IF (src1 == NaN)
IF (src1 == SNaN)
RETURN QNAN(src1)
FI
IF (src2 != INF)
RETURN QNAN(src1)
FI
ELSE
tmp_src2 := src2
tmp_src1 := src1
IF (src2 is denormal AND MXCSR.DAZ)
tmp_src2 := 0
FI
IF (src1 is denormal AND MXCSR.DAZ)
tmp_src1 := 0
FI
FI
dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
RETURN dst[31:0]
}
FOR j := 0 to 7
i := j*32
dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
ENDFOR
dst[MAX:256] := 0
vscalefps
__m512 _mm512_mask_scalef_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
Synopsis
__m512 _mm512_mask_scalef_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vscalefps zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
SCALE(src1, src2){
IF (src2 == NaN)
IF (src2 == SNaN)
RETURN QNAN(src2)
FI
ELSE IF (src1 == NaN)
IF (src1 == SNaN)
RETURN QNAN(src1)
FI
IF (src2 != INF)
RETURN QNAN(src1)
FI
ELSE
tmp_src2 := src2
tmp_src1 := src1
IF (src2 is denormal AND MXCSR.DAZ)
tmp_src2 := 0
FI
IF (src1 is denormal AND MXCSR.DAZ)
tmp_src1 := 0
FI
FI
dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
RETURN dst[31:0]
}
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vscalefps
__m512 _mm512_maskz_scalef_ps (__mmask16 k, __m512 a, __m512 b)
Synopsis
__m512 _mm512_maskz_scalef_ps (__mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vscalefps zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
SCALE(src1, src2){
IF (src2 == NaN)
IF (src2 == SNaN)
RETURN QNAN(src2)
FI
ELSE IF (src1 == NaN)
IF (src1 == SNaN)
RETURN QNAN(src1)
FI
IF (src2 != INF)
RETURN QNAN(src1)
FI
ELSE
tmp_src2 := src2
tmp_src1 := src1
IF (src2 is denormal AND MXCSR.DAZ)
tmp_src2 := 0
FI
IF (src1 is denormal AND MXCSR.DAZ)
tmp_src1 := 0
FI
FI
dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
RETURN dst[31:0]
}
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vscalefps
__m512 _mm512_scalef_ps (__m512 a, __m512 b)
Synopsis
__m512 _mm512_scalef_ps (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vscalefps zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Scale the packed single-precision (32-bit) floating-point elements in a using values from b, and store the results in dst.
Operation
SCALE(src1, src2){
IF (src2 == NaN)
IF (src2 == SNaN)
RETURN QNAN(src2)
FI
ELSE IF (src1 == NaN)
IF (src1 == SNaN)
RETURN QNAN(src1)
FI
IF (src2 != INF)
RETURN QNAN(src1)
FI
ELSE
tmp_src2 := src2
tmp_src1 := src1
IF (src2 is denormal AND MXCSR.DAZ)
tmp_src2 := 0
FI
IF (src1 is denormal AND MXCSR.DAZ)
tmp_src1 := 0
FI
FI
dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
RETURN dst[31:0]
}
FOR j := 0 to 15
i := j*32
dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
ENDFOR
dst[MAX:512] := 0
vscalefpd
__m512d _mm512_mask_scalef_round_pd (__m512d src, __mmask8 k, __m512d a, __m512d b, int rounding)
Synopsis
__m512d _mm512_mask_scalef_round_pd (__m512d src, __mmask8 k, __m512d a, __m512d b, int rounding)
#include "immintrin.h"
Instruction: vscalefpd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F
Description
Scale the packed double-precision (64-bit) floating-point elements in
a using values from
b, and store the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
SCALE(src1, src2){
IF (src2 == NaN)
IF (src2 == SNaN)
RETURN QNAN(src2)
FI
ELSE IF (src1 == NaN)
IF (src1 == SNaN)
RETURN QNAN(src1)
FI
IF (src2 != INF)
RETURN QNAN(src1)
FI
ELSE
tmp_src2 := src2
tmp_src1 := src1
IF (src2 is denormal AND MXCSR.DAZ)
tmp_src2 := 0
FI
IF (src1 is denormal AND MXCSR.DAZ)
tmp_src1 := 0
FI
FI
dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
RETURN dst[63:0]
}
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vscalefpd
__m512d _mm512_maskz_scalef_round_pd (__mmask8 k, __m512d a, __m512d b, int rounding)
Synopsis
__m512d _mm512_maskz_scalef_round_pd (__mmask8 k, __m512d a, __m512d b, int rounding)
#include "immintrin.h"
Instruction: vscalefpd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F
Description
Scale the packed double-precision (64-bit) floating-point elements in
a using values from
b, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
SCALE(src1, src2){
IF (src2 == NaN)
IF (src2 == SNaN)
RETURN QNAN(src2)
FI
ELSE IF (src1 == NaN)
IF (src1 == SNaN)
RETURN QNAN(src1)
FI
IF (src2 != INF)
RETURN QNAN(src1)
FI
ELSE
tmp_src2 := src2
tmp_src1 := src1
IF (src2 is denormal AND MXCSR.DAZ)
tmp_src2 := 0
FI
IF (src1 is denormal AND MXCSR.DAZ)
tmp_src1 := 0
FI
FI
dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
RETURN dst[63:0]
}
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vscalefpd
__m512d _mm512_scalef_round_pd (__m512d a, __m512d b, int rounding)
Synopsis
__m512d _mm512_scalef_round_pd (__m512d a, __m512d b, int rounding)
#include "immintrin.h"
Instruction: vscalefpd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F
Description
Scale the packed double-precision (64-bit) floating-point elements in
a using values from
b, and store the results in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
SCALE(src1, src2){
IF (src2 == NaN)
IF (src2 == SNaN)
RETURN QNAN(src2)
FI
ELSE IF (src1 == NaN)
IF (src1 == SNaN)
RETURN QNAN(src1)
FI
IF (src2 != INF)
RETURN QNAN(src1)
FI
ELSE
tmp_src2 := src2
tmp_src1 := src1
IF (src2 is denormal AND MXCSR.DAZ)
tmp_src2 := 0
FI
IF (src1 is denormal AND MXCSR.DAZ)
tmp_src1 := 0
FI
FI
dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
RETURN dst[63:0]
}
FOR j := 0 to 7
i := j*64
dst[i+63:i] := SCALE(a[i+63:i], b[i+63:i])
ENDFOR
dst[MAX:512] := 0
vscalefps
__m512 _mm512_mask_scalef_round_ps (__m512 src, __mmask16 k, __m512 a, __m512 b, int rounding)
Synopsis
__m512 _mm512_mask_scalef_round_ps (__m512 src, __mmask16 k, __m512 a, __m512 b, int rounding)
#include "immintrin.h"
Instruction: vscalefps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F
Description
Scale the packed single-precision (32-bit) floating-point elements in
a using values from
b, and store the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
SCALE(src1, src2){
IF (src2 == NaN)
IF (src2 == SNaN)
RETURN QNAN(src2)
FI
ELSE IF (src1 == NaN)
IF (src1 == SNaN)
RETURN QNAN(src1)
FI
IF (src2 != INF)
RETURN QNAN(src1)
FI
ELSE
tmp_src2 := src2
tmp_src1 := src1
IF (src2 is denormal AND MXCSR.DAZ)
tmp_src2 := 0
FI
IF (src1 is denormal AND MXCSR.DAZ)
tmp_src1 := 0
FI
FI
dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
RETURN dst[31:0]
}
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vscalefps
__m512 _mm512_maskz_scalef_round_ps (__mmask16 k, __m512 a, __m512 b, int rounding)
Synopsis
__m512 _mm512_maskz_scalef_round_ps (__mmask16 k, __m512 a, __m512 b, int rounding)
#include "immintrin.h"
Instruction: vscalefps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F
Description
Scale the packed single-precision (32-bit) floating-point elements in
a using values from
b, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
SCALE(src1, src2){
IF (src2 == NaN)
IF (src2 == SNaN)
RETURN QNAN(src2)
FI
ELSE IF (src1 == NaN)
IF (src1 == SNaN)
RETURN QNAN(src1)
FI
IF (src2 != INF)
RETURN QNAN(src1)
FI
ELSE
tmp_src2 := src2
tmp_src1 := src1
IF (src2 is denormal AND MXCSR.DAZ)
tmp_src2 := 0
FI
IF (src1 is denormal AND MXCSR.DAZ)
tmp_src1 := 0
FI
FI
dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
RETURN dst[31:0]
}
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vscalefps
__m512 _mm512_scalef_round_ps (__m512 a, __m512 b, int rounding)
Synopsis
__m512 _mm512_scalef_round_ps (__m512 a, __m512 b, int rounding)
#include "immintrin.h"
Instruction: vscalefps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F
Description
Scale the packed single-precision (32-bit) floating-point elements in
a using values from
b, and store the results in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
SCALE(src1, src2){
IF (src2 == NaN)
IF (src2 == SNaN)
RETURN QNAN(src2)
FI
ELSE IF (src1 == NaN)
IF (src1 == SNaN)
RETURN QNAN(src1)
FI
IF (src2 != INF)
RETURN QNAN(src1)
FI
ELSE
tmp_src2 := src2
tmp_src1 := src1
IF (src2 is denormal AND MXCSR.DAZ)
tmp_src2 := 0
FI
IF (src1 is denormal AND MXCSR.DAZ)
tmp_src1 := 0
FI
FI
dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
RETURN dst[31:0]
}
FOR j := 0 to 15
i := j*32
dst[i+31:i] := SCALE(a[i+31:i], b[i+31:i])
ENDFOR
dst[MAX:512] := 0
vscalefsd
__m128d _mm_mask_scalef_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int rounding)
Synopsis
__m128d _mm_mask_scalef_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vscalefsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Scale the packed double-precision (64-bit) floating-point elements in
a using values from
b, store the result in the lower element of
dst using writemask
k (the element is copied from
src when mask bit 0 is not set), and copy the upper element from
b to the upper element of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
SCALE(src1, src2){
IF (src2 == NaN)
IF (src2 == SNaN)
RETURN QNAN(src2)
FI
ELSE IF (src1 == NaN)
IF (src1 == SNaN)
RETURN QNAN(src1)
FI
IF (src2 != INF)
RETURN QNAN(src1)
FI
ELSE
tmp_src2 := src2
tmp_src1 := src1
IF (src2 is denormal AND MXCSR.DAZ)
tmp_src2 := 0
FI
IF (src1 is denormal AND MXCSR.DAZ)
tmp_src1 := 0
FI
FI
dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
RETURN dst[63:0]
}
IF k[0]
dst[63:0] := SCALE(a[63:0], b[63:0])
ELSE
dst[63:0] := src[63:0]
FI
dst[127:64] := b[127:64]
dst[MAX:128] := 0
vscalefsd
__m128d _mm_maskz_scalef_round_sd (__mmask8 k, __m128d a, __m128d b, int rounding)
Synopsis
__m128d _mm_maskz_scalef_round_sd (__mmask8 k, __m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vscalefsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Scale the packed double-precision (64-bit) floating-point elements in
a using values from
b, store the result in the lower element of
dst using zeromask
k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from
b to the upper element of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
SCALE(src1, src2){
IF (src2 == NaN)
IF (src2 == SNaN)
RETURN QNAN(src2)
FI
ELSE IF (src1 == NaN)
IF (src1 == SNaN)
RETURN QNAN(src1)
FI
IF (src2 != INF)
RETURN QNAN(src1)
FI
ELSE
tmp_src2 := src2
tmp_src1 := src1
IF (src2 is denormal AND MXCSR.DAZ)
tmp_src2 := 0
FI
IF (src1 is denormal AND MXCSR.DAZ)
tmp_src1 := 0
FI
FI
dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
RETURN dst[63:0]
}
IF k[0]
dst[63:0] := SCALE(a[63:0], b[63:0])
ELSE
dst[63:0] := 0
FI
dst[127:64] := b[127:64]
dst[MAX:128] := 0
vscalefsd
__m128d _mm_scalef_round_sd (__m128d a, __m128d b, int rounding)
Synopsis
__m128d _mm_scalef_round_sd (__m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vscalefsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Scale the packed double-precision (64-bit) floating-point elements in
a using values from
b, store the result in the lower element of
dst, and copy the upper element from
b to the upper element of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
SCALE(src1, src2){
IF (src2 == NaN)
IF (src2 == SNaN)
RETURN QNAN(src2)
FI
ELSE IF (src1 == NaN)
IF (src1 == SNaN)
RETURN QNAN(src1)
FI
IF (src2 != INF)
RETURN QNAN(src1)
FI
ELSE
tmp_src2 := src2
tmp_src1 := src1
IF (src2 is denormal AND MXCSR.DAZ)
tmp_src2 := 0
FI
IF (src1 is denormal AND MXCSR.DAZ)
tmp_src1 := 0
FI
FI
dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
RETURN dst[63:0]
}
dst[63:0] := SCALE(a[63:0], b[63:0])
dst[127:64] := b[127:64]
dst[MAX:128] := 0
vscalefss
__m128 _mm_mask_scalef_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int rounding)
Synopsis
__m128 _mm_mask_scalef_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vscalefss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Scale the packed single-precision (32-bit) floating-point elements in
a using values from
b, store the result in the lower element of
dst using writemask
k (the element is copied from
src when mask bit 0 is not set), and copy the upper 3 packed elements from
b to the upper elements of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
SCALE(src1, src2){
IF (src2 == NaN)
IF (src2 == SNaN)
RETURN QNAN(src2)
FI
ELSE IF (src1 == NaN)
IF (src1 == SNaN)
RETURN QNAN(src1)
FI
IF (src2 != INF)
RETURN QNAN(src1)
FI
ELSE
tmp_src2 := src2
tmp_src1 := src1
IF (src2 is denormal AND MXCSR.DAZ)
tmp_src2 := 0
FI
IF (src1 is denormal AND MXCSR.DAZ)
tmp_src1 := 0
FI
FI
dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
RETURN dst[31:0]
}
IF k[0]
dst[31:0] := SCALE(a[31:0], b[31:0])
ELSE
dst[31:0] := src[31:0]
FI
dst[127:32] := b[127:32]
dst[MAX:128] := 0
vscalefss
__m128 _mm_maskz_scalef_round_ss (__mmask8 k, __m128 a, __m128 b, int rounding)
Synopsis
__m128 _mm_maskz_scalef_round_ss (__mmask8 k, __m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vscalefss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Scale the packed single-precision (32-bit) floating-point elements in
a using values from
b, store the result in the lower element of
dst using zeromask
k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from
b to the upper elements of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
SCALE(src1, src2){
IF (src2 == NaN)
IF (src2 == SNaN)
RETURN QNAN(src2)
FI
ELSE IF (src1 == NaN)
IF (src1 == SNaN)
RETURN QNAN(src1)
FI
IF (src2 != INF)
RETURN QNAN(src1)
FI
ELSE
tmp_src2 := src2
tmp_src1 := src1
IF (src2 is denormal AND MXCSR.DAZ)
tmp_src2 := 0
FI
IF (src1 is denormal AND MXCSR.DAZ)
tmp_src1 := 0
FI
FI
dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
RETURN dst[31:0]
}
IF k[0]
dst[31:0] := SCALE(a[31:0], b[31:0])
ELSE
dst[31:0] := 0
FI
dst[127:32] := b[127:32]
dst[MAX:128] := 0
vscalefss
__m128 _mm_scalef_round_ss (__m128 a, __m128 b, int rounding)
Synopsis
__m128 _mm_scalef_round_ss (__m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vscalefss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Scale the packed single-precision (32-bit) floating-point elements in
a using values from
b, store the result in the lower element of
dst, and copy the upper 3 packed elements from
b to the upper elements of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
SCALE(src1, src2){
IF (src2 == NaN)
IF (src2 == SNaN)
RETURN QNAN(src2)
FI
ELSE IF (src1 == NaN)
IF (src1 == SNaN)
RETURN QNAN(src1)
FI
IF (src2 != INF)
RETURN QNAN(src1)
FI
ELSE
tmp_src2 := src2
tmp_src1 := src1
IF (src2 is denormal AND MXCSR.DAZ)
tmp_src2 := 0
FI
IF (src1 is denormal AND MXCSR.DAZ)
tmp_src1 := 0
FI
FI
dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
RETURN dst[31:0]
}
dst[31:0] := SCALE(a[31:0], b[31:0])
dst[127:32] := b[127:32]
dst[MAX:128] := 0
vscalefsd
__m128d _mm_mask_scalef_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_mask_scalef_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vscalefsd xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from b to the upper element of dst.
Operation
SCALE(src1, src2){
IF (src2 == NaN)
IF (src2 == SNaN)
RETURN QNAN(src2)
FI
ELSE IF (src1 == NaN)
IF (src1 == SNaN)
RETURN QNAN(src1)
FI
IF (src2 != INF)
RETURN QNAN(src1)
FI
ELSE
tmp_src2 := src2
tmp_src1 := src1
IF (src2 is denormal AND MXCSR.DAZ)
tmp_src2 := 0
FI
IF (src1 is denormal AND MXCSR.DAZ)
tmp_src1 := 0
FI
FI
dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
RETURN dst[63:0]
}
IF k[0]
dst[63:0] := SCALE(a[63:0], b[63:0])
ELSE
dst[63:0] := src[63:0]
FI
dst[127:64] := b[127:64]
dst[MAX:128] := 0
vscalefsd
__m128d _mm_maskz_scalef_sd (__mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_maskz_scalef_sd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vscalefsd xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from b to the upper element of dst.
Operation
SCALE(src1, src2){
IF (src2 == NaN)
IF (src2 == SNaN)
RETURN QNAN(src2)
FI
ELSE IF (src1 == NaN)
IF (src1 == SNaN)
RETURN QNAN(src1)
FI
IF (src2 != INF)
RETURN QNAN(src1)
FI
ELSE
tmp_src2 := src2
tmp_src1 := src1
IF (src2 is denormal AND MXCSR.DAZ)
tmp_src2 := 0
FI
IF (src1 is denormal AND MXCSR.DAZ)
tmp_src1 := 0
FI
FI
dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
RETURN dst[63:0]
}
IF k[0]
dst[63:0] := SCALE(a[63:0], b[63:0])
ELSE
dst[63:0] := 0
FI
dst[127:64] := b[127:64]
dst[MAX:128] := 0
vscalefsd
__m128d _mm_scalef_sd (__m128d a, __m128d b)
Synopsis
__m128d _mm_scalef_sd (__m128d a, __m128d b)
#include "immintrin.h"
Instruction: vscalefsd xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst, and copy the upper element from b to the upper element of dst.
Operation
SCALE(src1, src2){
IF (src2 == NaN)
IF (src2 == SNaN)
RETURN QNAN(src2)
FI
ELSE IF (src1 == NaN)
IF (src1 == SNaN)
RETURN QNAN(src1)
FI
IF (src2 != INF)
RETURN QNAN(src1)
FI
ELSE
tmp_src2 := src2
tmp_src1 := src1
IF (src2 is denormal AND MXCSR.DAZ)
tmp_src2 := 0
FI
IF (src1 is denormal AND MXCSR.DAZ)
tmp_src1 := 0
FI
FI
dst[63:0] := tmp_src1[63:0] * POW(2, FLOOR(tmp_src2[63:0]))
RETURN dst[63:0]
}
dst[63:0] := SCALE(a[63:0], b[63:0])
dst[127:64] := b[127:64]
dst[MAX:128] := 0
vscalefss
__m128 _mm_mask_scalef_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_mask_scalef_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vscalefss xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from b to the upper elements of dst.
Operation
SCALE(src1, src2){
IF (src2 == NaN)
IF (src2 == SNaN)
RETURN QNAN(src2)
FI
ELSE IF (src1 == NaN)
IF (src1 == SNaN)
RETURN QNAN(src1)
FI
IF (src2 != INF)
RETURN QNAN(src1)
FI
ELSE
tmp_src2 := src2
tmp_src1 := src1
IF (src2 is denormal AND MXCSR.DAZ)
tmp_src2 := 0
FI
IF (src1 is denormal AND MXCSR.DAZ)
tmp_src1 := 0
FI
FI
dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
RETURN dst[31:0]
}
IF k[0]
dst[31:0] := SCALE(a[31:0], b[31:0])
ELSE
dst[31:0] := src[31:0]
FI
dst[127:32] := b[127:32]
dst[MAX:128] := 0
vscalefss
__m128 _mm_maskz_scalef_ss (__mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_maskz_scalef_ss (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vscalefss xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from b to the upper elements of dst.
Operation
SCALE(src1, src2){
IF (src2 == NaN)
IF (src2 == SNaN)
RETURN QNAN(src2)
FI
ELSE IF (src1 == NaN)
IF (src1 == SNaN)
RETURN QNAN(src1)
FI
IF (src2 != INF)
RETURN QNAN(src1)
FI
ELSE
tmp_src2 := src2
tmp_src1 := src1
IF (src2 is denormal AND MXCSR.DAZ)
tmp_src2 := 0
FI
IF (src1 is denormal AND MXCSR.DAZ)
tmp_src1 := 0
FI
FI
dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
RETURN dst[31:0]
}
IF k[0]
dst[31:0] := SCALE(a[31:0], b[31:0])
ELSE
dst[31:0] := 0
FI
dst[127:32] := b[127:32]
dst[MAX:128] := 0
vscalefss
__m128 _mm_scalef_ss (__m128 a, __m128 b)
Synopsis
__m128 _mm_scalef_ss (__m128 a, __m128 b)
#include "immintrin.h"
Instruction: vscalefss xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Scale the packed single-precision (32-bit) floating-point elements in a using values from b, store the result in the lower element of dst, and copy the upper 3 packed elements from b to the upper elements of dst.
Operation
SCALE(src1, src2){
IF (src2 == NaN)
IF (src2 == SNaN)
RETURN QNAN(src2)
FI
ELSE IF (src1 == NaN)
IF (src1 == SNaN)
RETURN QNAN(src1)
FI
IF (src2 != INF)
RETURN QNAN(src1)
FI
ELSE
tmp_src2 := src2
tmp_src1 := src1
IF (src2 is denormal AND MXCSR.DAZ)
tmp_src2 := 0
FI
IF (src1 is denormal AND MXCSR.DAZ)
tmp_src1 := 0
FI
FI
dst[31:0] := tmp_src1[31:0] * POW(2, FLOOR(tmp_src2[31:0]))
RETURN dst[31:0]
}
dst[31:0] := SCALE(a[31:0], b[31:0])
dst[127:32] := b[127:32]
dst[MAX:128] := 0
...
__m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0)
Synopsis
__m128i _mm_set_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0)
#include "emmintrin.h"
CPUID Flags: SSE2
Description
Set packed 16-bit integers in dst with the supplied values.
Operation
dst[15:0] := e0
dst[31:16] := e1
dst[47:32] := e2
dst[63:48] := e3
dst[79:64] := e4
dst[95:80] := e5
dst[111:96] := e6
dst[127:112] := e7
...
__m256i _mm256_set_epi16 (short e15, short e14, short e13, short e12, short e11, short e10, short e9, short e8, short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0)
Synopsis
__m256i _mm256_set_epi16 (short e15, short e14, short e13, short e12, short e11, short e10, short e9, short e8, short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0)
#include "immintrin.h"
CPUID Flags: AVX
Description
Set packed 16-bit integers in dst with the supplied values.
Operation
dst[15:0] := e0
dst[31:16] := e1
dst[47:32] := e2
dst[63:48] := e3
dst[79:64] := e4
dst[95:80] := e5
dst[111:96] := e6
dst[127:112] := e7
dst[143:128] := e8
dst[159:144] := e9
dst[175:160] := e10
dst[191:176] := e11
dst[207:192] := e12
dst[223:208] := e13
dst[239:224] := e14
dst[255:240] := e15
dst[MAX:256] := 0
...
__m128i _mm_set_epi32 (int e3, int e2, int e1, int e0)
Synopsis
__m128i _mm_set_epi32 (int e3, int e2, int e1, int e0)
#include "emmintrin.h"
CPUID Flags: SSE2
Description
Set packed 32-bit integers in dst with the supplied values.
Operation
dst[31:0] := e0
dst[63:32] := e1
dst[95:64] := e2
dst[127:96] := e3
...
__m256i _mm256_set_epi32 (int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
Synopsis
__m256i _mm256_set_epi32 (int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
#include "immintrin.h"
CPUID Flags: AVX
Description
Set packed 32-bit integers in dst with the supplied values.
Operation
dst[31:0] := e0
dst[63:32] := e1
dst[95:64] := e2
dst[127:96] := e3
dst[159:128] := e4
dst[191:160] := e5
dst[223:192] := e6
dst[255:224] := e7
dst[MAX:256] := 0
...
__m512i _mm512_set_epi32 (int e15, int e14, int e13, int e12, int e11, int e10, int e9, int e8, int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
Synopsis
__m512i _mm512_set_epi32 (int e15, int e14, int e13, int e12, int e11, int e10, int e9, int e8, int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Set packed 32-bit integers in dst with the supplied values.
Operation
dst[31:0] := e0
dst[63:32] := e1
dst[95:64] := e2
dst[127:96] := e3
dst[159:128] := e4
dst[191:160] := e5
dst[223:192] := e6
dst[255:224] := e7
dst[287:256] := e8
dst[319:288] := e9
dst[351:320] := e10
dst[383:352] := e11
dst[415:384] := e12
dst[447:416] := e13
dst[479:448] := e14
dst[511:480] := e15
dst[MAX:512] := 0
...
__m128i _mm_set_epi64 (__m64 e1, __m64 e0)
Synopsis
__m128i _mm_set_epi64 (__m64 e1, __m64 e0)
#include "emmintrin.h"
CPUID Flags: SSE2
Description
Set packed 64-bit integers in dst with the supplied values.
Operation
dst[63:0] := e0
dst[127:64] := e1
...
__m512i _mm512_set_epi64 (__int64 e7, __int64 e6, __int64 e5, __int64 e4, __int64 e3, __int64 e2, __int64 e1, __int64 e0)
Synopsis
__m512i _mm512_set_epi64 (__int64 e7, __int64 e6, __int64 e5, __int64 e4, __int64 e3, __int64 e2, __int64 e1, __int64 e0)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Set packed 64-bit integers in dst with the supplied values.
Operation
dst[63:0] := e0
dst[127:64] := e1
dst[191:128] := e2
dst[255:192] := e3
dst[319:256] := e4
dst[383:320] := e5
dst[447:384] := e6
dst[511:448] := e7
dst[MAX:512] := 0
...
__m128i _mm_set_epi64x (__int64 e1, __int64 e0)
Synopsis
__m128i _mm_set_epi64x (__int64 e1, __int64 e0)
#include "emmintrin.h"
CPUID Flags: SSE2
Description
Set packed 64-bit integers in dst with the supplied values.
Operation
dst[63:0] := e0
dst[127:64] := e1
...
__m256i _mm256_set_epi64x (__int64 e3, __int64 e2, __int64 e1, __int64 e0)
Synopsis
__m256i _mm256_set_epi64x (__int64 e3, __int64 e2, __int64 e1, __int64 e0)
#include "immintrin.h"
CPUID Flags: AVX
Description
Set packed 64-bit integers in dst with the supplied values.
Operation
dst[63:0] := e0
dst[127:64] := e1
dst[191:128] := e2
dst[255:192] := e3
dst[MAX:256] := 0
...
__m128i _mm_set_epi8 (char e15, char e14, char e13, char e12, char e11, char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0)
Synopsis
__m128i _mm_set_epi8 (char e15, char e14, char e13, char e12, char e11, char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0)
#include "emmintrin.h"
CPUID Flags: SSE2
Description
Set packed 8-bit integers in dst with the supplied values in reverse order.
Operation
dst[7:0] := e0
dst[15:8] := e1
dst[23:16] := e2
dst[31:24] := e3
dst[39:32] := e4
dst[47:40] := e5
dst[55:48] := e6
dst[63:56] := e7
dst[71:64] := e8
dst[79:72] := e9
dst[87:80] := e10
dst[95:88] := e11
dst[103:96] := e12
dst[111:104] := e13
dst[119:112] := e14
dst[127:120] := e15
...
__m256i _mm256_set_epi8 (char e31, char e30, char e29, char e28, char e27, char e26, char e25, char e24, char e23, char e22, char e21, char e20, char e19, char e18, char e17, char e16, char e15, char e14, char e13, char e12, char e11, char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0)
Synopsis
__m256i _mm256_set_epi8 (char e31, char e30, char e29, char e28, char e27, char e26, char e25, char e24, char e23, char e22, char e21, char e20, char e19, char e18, char e17, char e16, char e15, char e14, char e13, char e12, char e11, char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0)
#include "immintrin.h"
CPUID Flags: AVX
Description
Set packed 8-bit integers in dst with the supplied values in reverse order.
Operation
dst[7:0] := e0
dst[15:8] := e1
dst[23:16] := e2
dst[31:24] := e3
dst[39:32] := e4
dst[47:40] := e5
dst[55:48] := e6
dst[63:56] := e7
dst[71:64] := e8
dst[79:72] := e9
dst[87:80] := e10
dst[95:88] := e11
dst[103:96] := e12
dst[111:104] := e13
dst[119:112] := e14
dst[127:120] := e15
dst[135:128] := e16
dst[143:136] := e17
dst[151:144] := e18
dst[159:152] := e19
dst[167:160] := e20
dst[175:168] := e21
dst[183:176] := e22
dst[191:184] := e23
dst[199:192] := e24
dst[207:200] := e25
dst[215:208] := e26
dst[223:216] := e27
dst[231:224] := e28
dst[239:232] := e29
dst[247:240] := e30
dst[255:248] := e31
dst[MAX:256] := 0
void _MM_SET_EXCEPTION_MASK (unsigned int a)
Synopsis
void _MM_SET_EXCEPTION_MASK (unsigned int a)
#include "xmmintrin.h"
CPUID Flags: SSE
Description
Macro: Set the exception mask bits of the MXCSR control and status register to the value in unsigned 32-bit integer a. The exception mask may contain any of the following flags: _MM_MASK_INVALID, _MM_MASK_DIV_ZERO, _MM_MASK_DENORM, _MM_MASK_OVERFLOW, _MM_MASK_UNDERFLOW, _MM_MASK_INEXACT
Operation
MXCSR := (MXCSR AND ~_MM_MASK_MASK) OR a[31:0]
void _MM_SET_EXCEPTION_STATE (unsigned int a)
Synopsis
void _MM_SET_EXCEPTION_STATE (unsigned int a)
#include "xmmintrin.h"
CPUID Flags: SSE
Description
Macro: Set the exception state bits of the MXCSR control and status register to the value in unsigned 32-bit integer a. The exception state may contain any of the following flags: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO, _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW, _MM_EXCEPT_INEXACT
Operation
MXCSR := (MXCSR AND ~_MM_EXCEPT_MASK) OR a[31:0]
void _MM_SET_FLUSH_ZERO_MODE (unsigned int a)
Synopsis
void _MM_SET_FLUSH_ZERO_MODE (unsigned int a)
#include "xmmintrin.h"
CPUID Flags: SSE
Description
Macro: Set the flush zero bits of the MXCSR control and status register to the value in unsigned 32-bit integer a. The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF
Operation
MXCSR := (MXCSR AND ~_MM_FLUSH_MASK) OR a[31:0]
vinsertf128
__m256 _mm256_set_m128 (__m128 hi, __m128 lo)
Synopsis
__m256 _mm256_set_m128 (__m128 hi, __m128 lo)
#include "immintrin.h"
Instruction: vinsertf128 ymm, ymm, xmm, imm
CPUID Flags: AVX
Description
Set packed __m256 vector dst with the supplied values.
Operation
dst[127:0] := lo[127:0]
dst[255:128] := hi[127:0]
dst[MAX:256] := 0
Performance
vinsertf128
__m256d _mm256_set_m128d (__m128d hi, __m128d lo)
Synopsis
__m256d _mm256_set_m128d (__m128d hi, __m128d lo)
#include "immintrin.h"
Instruction: vinsertf128 ymm, ymm, xmm, imm
CPUID Flags: AVX
Description
Set packed __m256d vector dst with the supplied values.
Operation
dst[127:0] := lo[127:0]
dst[255:128] := hi[127:0]
dst[MAX:256] := 0
Performance
vinsertf128
__m256i _mm256_set_m128i (__m128i hi, __m128i lo)
Synopsis
__m256i _mm256_set_m128i (__m128i hi, __m128i lo)
#include "immintrin.h"
Instruction: vinsertf128 ymm, ymm, xmm, imm
CPUID Flags: AVX
Description
Set packed __m256i vector dst with the supplied values.
Operation
dst[127:0] := lo[127:0]
dst[255:128] := hi[127:0]
dst[MAX:256] := 0
Performance
...
__m128d _mm_set_pd (double e1, double e0)
Synopsis
__m128d _mm_set_pd (double e1, double e0)
#include "emmintrin.h"
CPUID Flags: SSE2
Description
Set packed double-precision (64-bit) floating-point elements in dst with the supplied values.
Operation
dst[63:0] := e0
dst[127:64] := e1
...
__m256d _mm256_set_pd (double e3, double e2, double e1, double e0)
Synopsis
__m256d _mm256_set_pd (double e3, double e2, double e1, double e0)
#include "immintrin.h"
CPUID Flags: AVX
Description
Set packed double-precision (64-bit) floating-point elements in dst with the supplied values.
Operation
dst[63:0] := e0
dst[127:64] := e1
dst[191:128] := e2
dst[255:192] := e3
dst[MAX:256] := 0
...
__m512d _mm512_set_pd (double e7, double e6, double e5, double e4, double e3, double e2, double e1, double e0)
Synopsis
__m512d _mm512_set_pd (double e7, double e6, double e5, double e4, double e3, double e2, double e1, double e0)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Set packed double-precision (64-bit) floating-point elements in dst with the supplied values.
Operation
dst[63:0] := e0
dst[127:64] := e1
dst[191:128] := e2
dst[255:192] := e3
dst[319:256] := e4
dst[383:320] := e5
dst[447:384] := e6
dst[511:448] := e7
dst[MAX:512] := 0
...
__m128d _mm_set_pd1 (double a)
Synopsis
__m128d _mm_set_pd1 (double a)
#include "emmintrin.h"
CPUID Flags: SSE2
Description
Broadcast double-precision (64-bit) floating-point value a to all elements of dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := a[63:0]
ENDFOR
...
__m128 _mm_set_ps (float e3, float e2, float e1, float e0)
Synopsis
__m128 _mm_set_ps (float e3, float e2, float e1, float e0)
#include "xmmintrin.h"
CPUID Flags: SSE
Description
Set packed single-precision (32-bit) floating-point elements in dst with the supplied values.
Operation
dst[31:0] := e0
dst[63:32] := e1
dst[95:64] := e2
dst[127:96] := e3
...
__m256 _mm256_set_ps (float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
Synopsis
__m256 _mm256_set_ps (float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
#include "immintrin.h"
CPUID Flags: AVX
Description
Set packed single-precision (32-bit) floating-point elements in dst with the supplied values.
Operation
dst[31:0] := e0
dst[63:32] := e1
dst[95:64] := e2
dst[127:96] := e3
dst[159:128] := e4
dst[191:160] := e5
dst[223:192] := e6
dst[255:224] := e7
dst[MAX:256] := 0
...
__m512 _mm512_set_ps (float e15, float e14, float e13, float e12, float e11, float e10, float e9, float e8, float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
Synopsis
__m512 _mm512_set_ps (float e15, float e14, float e13, float e12, float e11, float e10, float e9, float e8, float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Set packed single-precision (32-bit) floating-point elements in dst with the supplied values.
Operation
dst[31:0] := e0
dst[63:32] := e1
dst[95:64] := e2
dst[127:96] := e3
dst[159:128] := e4
dst[191:160] := e5
dst[223:192] := e6
dst[255:224] := e7
dst[287:256] := e8
dst[319:288] := e9
dst[351:320] := e10
dst[383:352] := e11
dst[415:384] := e12
dst[447:416] := e13
dst[479:448] := e14
dst[511:480] := e15
dst[MAX:512] := 0
...
__m128 _mm_set_ps1 (float a)
Synopsis
__m128 _mm_set_ps1 (float a)
#include "xmmintrin.h"
CPUID Flags: SSE
Description
Broadcast single-precision (32-bit) floating-point value a to all elements of dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := a[31:0]
ENDFOR
void _MM_SET_ROUNDING_MODE (unsigned int a)
Synopsis
void _MM_SET_ROUNDING_MODE (unsigned int a)
#include "xmmintrin.h"
CPUID Flags: SSE
Description
Macro: Set the rounding mode bits of the MXCSR control and status register to the value in unsigned 32-bit integer a. The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
Operation
MXCSR := (MXCSR AND ~_MM_ROUND_MASK) OR a[31:0]
...
__m128d _mm_set_sd (double a)
Synopsis
__m128d _mm_set_sd (double a)
#include "emmintrin.h"
CPUID Flags: SSE2
Description
Copy double-precision (64-bit) floating-point element a to the lower element of dst, and zero the upper element.
Operation
dst[63:0] := a[63:0]
dst[127:64] := 0
...
__m128 _mm_set_ss (float a)
Synopsis
__m128 _mm_set_ss (float a)
#include "xmmintrin.h"
CPUID Flags: SSE
Description
Copy single-precision (32-bit) floating-point element a to the lower element of dst, and zero the upper 3 elements.
Operation
dst[31:0] := a[31:0]
dst[127:32] := 0
vpbroadcastw
__m128i _mm_mask_set1_epi16 (__m128i src, __mmask8 k, short a)
Synopsis
__m128i _mm_mask_set1_epi16 (__m128i src, __mmask8 k, short a)
#include "immintrin.h"
Instruction: vpbroadcastw
CPUID Flags: AVX512VL + AVX512BW
Description
Broadcast the low packed 16-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := a[15:0]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:128] := 0
vpbroadcastw
__m128i _mm_maskz_set1_epi16 (__mmask8 k, short a)
Synopsis
__m128i _mm_maskz_set1_epi16 (__mmask8 k, short a)
#include "immintrin.h"
Instruction: vpbroadcastw
CPUID Flags: AVX512VL + AVX512BW
Description
Broadcast the low packed 16-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := a[15:0]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
...
__m128i _mm_set1_epi16 (short a)
Synopsis
__m128i _mm_set1_epi16 (short a)
#include "emmintrin.h"
CPUID Flags: SSE2
Description
Broadcast 16-bit integer a to all elements of dst. This intrinsic may generate vpbroadcastw.
Operation
FOR j := 0 to 7
i := j*16
dst[i+15:i] := a[15:0]
ENDFOR
vpbroadcastw
__m256i _mm256_mask_set1_epi16 (__m256i src, __mmask16 k, short a)
Synopsis
__m256i _mm256_mask_set1_epi16 (__m256i src, __mmask16 k, short a)
#include "immintrin.h"
Instruction: vpbroadcastw
CPUID Flags: AVX512VL + AVX512BW
Description
Broadcast the low packed 16-bit integer from a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := a[15:0]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
vpbroadcastw
__m256i _mm256_maskz_set1_epi16 (__mmask16 k, short a)
Synopsis
__m256i _mm256_maskz_set1_epi16 (__mmask16 k, short a)
#include "immintrin.h"
Instruction: vpbroadcastw
CPUID Flags: AVX512VL + AVX512BW
Description
Broadcast 16-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := a[15:0]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
...
__m256i _mm256_set1_epi16 (short a)
Synopsis
__m256i _mm256_set1_epi16 (short a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Broadcast 16-bit integer a to all elements of dst. This intrinsic may generate the vpbroadcastw.
Operation
FOR j := 0 to 15
i := j*16
dst[i+15:i] := a[15:0]
ENDFOR
dst[MAX:256] := 0
vpbroadcastw
__m512i _mm512_mask_set1_epi16 (__m512i src, __mmask32 k, short a)
Synopsis
__m512i _mm512_mask_set1_epi16 (__m512i src, __mmask32 k, short a)
#include "immintrin.h"
Instruction: vpbroadcastw
CPUID Flags: AVX512BW
Description
Broadcast 16-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := a[15:0]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:512] := 0
vpbroadcastw
__m512i _mm512_maskz_set1_epi16 (__mmask32 k, short a)
Synopsis
__m512i _mm512_maskz_set1_epi16 (__mmask32 k, short a)
#include "immintrin.h"
Instruction: vpbroadcastw
CPUID Flags: AVX512BW
Description
Broadcast the low packed 16-bit integer from a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := a[15:0]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
...
__m512i _mm512_set1_epi16 (short a)
Synopsis
__m512i _mm512_set1_epi16 (short a)
#include "immintrin.h"
Instruction: vpbroadcastw zmm, xmm
CPUID Flags: AVX512F
Description
Broadcast the low packed 16-bit integer from a to all elements of dst.
Operation
FOR j := 0 to 31
i := j*16
dst[i+15:i] := a[15:0]
ENDFOR
dst[MAX:512] := 0
vpbroadcastd
__m128i _mm_mask_set1_epi32 (__m128i src, __mmask8 k, int a)
Synopsis
__m128i _mm_mask_set1_epi32 (__m128i src, __mmask8 k, int a)
#include "immintrin.h"
Instruction: vpbroadcastd
CPUID Flags: AVX512VL + AVX512F
Description
Broadcast 32-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := a[31:0]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vpbroadcastd
__m128i _mm_maskz_set1_epi32 (__mmask8 k, int a)
Synopsis
__m128i _mm_maskz_set1_epi32 (__mmask8 k, int a)
#include "immintrin.h"
Instruction: vpbroadcastd
CPUID Flags: AVX512VL + AVX512F
Description
Broadcast 32-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := a[31:0]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
...
__m128i _mm_set1_epi32 (int a)
Synopsis
__m128i _mm_set1_epi32 (int a)
#include "emmintrin.h"
CPUID Flags: SSE2
Description
Broadcast 32-bit integer a to all elements of dst. This intrinsic may generate vpbroadcastd.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := a[31:0]
ENDFOR
vpbroadcastd
__m256i _mm256_mask_set1_epi32 (__m256i src, __mmask8 k, int a)
Synopsis
__m256i _mm256_mask_set1_epi32 (__m256i src, __mmask8 k, int a)
#include "immintrin.h"
Instruction: vpbroadcastd
CPUID Flags: AVX512VL + AVX512F
Description
Broadcast 32-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := a[31:0]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vpbroadcastd
__m256i _mm256_maskz_set1_epi32 (__mmask8 k, int a)
Synopsis
__m256i _mm256_maskz_set1_epi32 (__mmask8 k, int a)
#include "immintrin.h"
Instruction: vpbroadcastd
CPUID Flags: AVX512VL + AVX512F
Description
Broadcast 32-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := a[31:0]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
...
__m256i _mm256_set1_epi32 (int a)
Synopsis
__m256i _mm256_set1_epi32 (int a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Broadcast 32-bit integer a to all elements of dst. This intrinsic may generate the vpbroadcastd.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := a[31:0]
ENDFOR
dst[MAX:256] := 0
vpbroadcastd
__m512i _mm512_mask_set1_epi32 (__m512i src, __mmask16 k, int a)
Synopsis
__m512i _mm512_mask_set1_epi32 (__m512i src, __mmask16 k, int a)
#include "immintrin.h"
Instruction: vpbroadcastd zmm {k}, r32
CPUID Flags: AVX512F
Description
Broadcast 32-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := a[31:0]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpbroadcastd
__m512i _mm512_maskz_set1_epi32 (__mmask16 k, int a)
Synopsis
__m512i _mm512_maskz_set1_epi32 (__mmask16 k, int a)
#include "immintrin.h"
Instruction: vpbroadcastd zmm {k}, r32
CPUID Flags: AVX512F
Description
Broadcast 32-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := a[31:0]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpbroadcastd
__m512i _mm512_set1_epi32 (int a)
Synopsis
__m512i _mm512_set1_epi32 (int a)
#include "immintrin.h"
Instruction: vpbroadcastd zmm {k}, r32
CPUID Flags: AVX512F
Description
Broadcast 32-bit integer a to all elements of dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := a[31:0]
ENDFOR
dst[MAX:512] := 0
vpbroadcastq
__m128i _mm_mask_set1_epi64 (__m128i src, __mmask8 k, __int64 a)
Synopsis
__m128i _mm_mask_set1_epi64 (__m128i src, __mmask8 k, __int64 a)
#include "immintrin.h"
Instruction: vpbroadcastq
CPUID Flags: AVX512VL + AVX512F
Description
Broadcast 64-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := a[63:0]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpbroadcastq
__m128i _mm_maskz_set1_epi64 (__mmask8 k, __int64 a)
Synopsis
__m128i _mm_maskz_set1_epi64 (__mmask8 k, __int64 a)
#include "immintrin.h"
Instruction: vpbroadcastq
CPUID Flags: AVX512VL + AVX512F
Description
Broadcast 64-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := a[63:0]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
...
__m128i _mm_set1_epi64 (__m64 a)
Synopsis
__m128i _mm_set1_epi64 (__m64 a)
#include "emmintrin.h"
CPUID Flags: SSE2
Description
Broadcast 64-bit integer a to all elements of dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := a[63:0]
ENDFOR
vpbroadcastq
__m256i _mm256_mask_set1_epi64 (__m256i src, __mmask8 k, __int64 a)
Synopsis
__m256i _mm256_mask_set1_epi64 (__m256i src, __mmask8 k, __int64 a)
#include "immintrin.h"
Instruction: vpbroadcastq
CPUID Flags: AVX512VL + AVX512F
Description
Broadcast 64-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := a[63:0]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpbroadcastq
__m256i _mm256_maskz_set1_epi64 (__mmask8 k, __int64 a)
Synopsis
__m256i _mm256_maskz_set1_epi64 (__mmask8 k, __int64 a)
#include "immintrin.h"
Instruction: vpbroadcastq
CPUID Flags: AVX512VL + AVX512F
Description
Broadcast 64-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := a[63:0]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpbroadcastq
__m512i _mm512_mask_set1_epi64 (__m512i src, __mmask8 k, __int64 a)
Synopsis
__m512i _mm512_mask_set1_epi64 (__m512i src, __mmask8 k, __int64 a)
#include "immintrin.h"
Instruction: vpbroadcastq zmm {k}, r64
CPUID Flags: AVX512F
Description
Broadcast 64-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[63:0]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpbroadcastq
__m512i _mm512_maskz_set1_epi64 (__mmask8 k, __int64 a)
Synopsis
__m512i _mm512_maskz_set1_epi64 (__mmask8 k, __int64 a)
#include "immintrin.h"
Instruction: vpbroadcastq zmm {k}, r64
CPUID Flags: AVX512F
Description
Broadcast 64-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[63:0]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpbroadcastq
__m512i _mm512_set1_epi64 (__int64 a)
Synopsis
__m512i _mm512_set1_epi64 (__int64 a)
#include "immintrin.h"
Instruction: vpbroadcastq zmm {k}, r64
CPUID Flags: AVX512F
Description
Broadcast 64-bit integer a to all elements of dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := a[63:0]
ENDFOR
dst[MAX:512] := 0
...
__m128i _mm_set1_epi64x (__int64 a)
Synopsis
__m128i _mm_set1_epi64x (__int64 a)
#include "emmintrin.h"
CPUID Flags: SSE2
Description
Broadcast 64-bit integer a to all elements of dst. This intrinsic may generate the vpbroadcastq.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := a[63:0]
ENDFOR
...
__m256i _mm256_set1_epi64x (long long a)
Synopsis
__m256i _mm256_set1_epi64x (long long a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Broadcast 64-bit integer a to all elements of dst. This intrinsic may generate the vpbroadcastq.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := a[63:0]
ENDFOR
dst[MAX:256] := 0
vpbroadcastb
__m128i _mm_mask_set1_epi8 (__m128i src, __mmask16 k, char a)
Synopsis
__m128i _mm_mask_set1_epi8 (__m128i src, __mmask16 k, char a)
#include "immintrin.h"
Instruction: vpbroadcastb
CPUID Flags: AVX512VL + AVX512BW
Description
Broadcast 8-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k[j]
dst[i+7:i] := a[7:0]
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:128] := 0
vpbroadcastb
__m128i _mm_maskz_set1_epi8 (__mmask16 k, char a)
Synopsis
__m128i _mm_maskz_set1_epi8 (__mmask16 k, char a)
#include "immintrin.h"
Instruction: vpbroadcastb
CPUID Flags: AVX512VL + AVX512BW
Description
Broadcast 8-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k[j]
dst[i+7:i] := a[7:0]
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
...
__m128i _mm_set1_epi8 (char a)
Synopsis
__m128i _mm_set1_epi8 (char a)
#include "emmintrin.h"
CPUID Flags: SSE2
Description
Broadcast 8-bit integer a to all elements of dst. This intrinsic may generate vpbroadcastb.
Operation
FOR j := 0 to 15
i := j*8
dst[i+7:i] := a[7:0]
ENDFOR
vpbroadcastb
__m256i _mm256_mask_set1_epi8 (__m256i src, __mmask32 k, char a)
Synopsis
__m256i _mm256_mask_set1_epi8 (__m256i src, __mmask32 k, char a)
#include "immintrin.h"
Instruction: vpbroadcastb
CPUID Flags: AVX512VL + AVX512BW
Description
Broadcast 8-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k[j]
dst[i+7:i] := a[7:0]
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:256] := 0
vpbroadcastb
__m256i _mm256_maskz_set1_epi8 (__mmask32 k, char a)
Synopsis
__m256i _mm256_maskz_set1_epi8 (__mmask32 k, char a)
#include "immintrin.h"
Instruction: vpbroadcastb
CPUID Flags: AVX512VL + AVX512BW
Description
Broadcast 8-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k[j]
dst[i+7:i] := a[7:0]
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
...
__m256i _mm256_set1_epi8 (char a)
Synopsis
__m256i _mm256_set1_epi8 (char a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Broadcast 8-bit integer a to all elements of dst. This intrinsic may generate the vpbroadcastb.
Operation
FOR j := 0 to 31
i := j*8
dst[i+7:i] := a[7:0]
ENDFOR
dst[MAX:256] := 0
vpbroadcastb
__m512i _mm512_mask_set1_epi8 (__m512i src, __mmask64 k, char a)
Synopsis
__m512i _mm512_mask_set1_epi8 (__m512i src, __mmask64 k, char a)
#include "immintrin.h"
Instruction: vpbroadcastb
CPUID Flags: AVX512BW
Description
Broadcast 8-bit integer a to all elements of dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k[j]
dst[i+7:i] := a[7:0]
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:512] := 0
vpbroadcastb
__m512i _mm512_maskz_set1_epi8 (__mmask64 k, char a)
Synopsis
__m512i _mm512_maskz_set1_epi8 (__mmask64 k, char a)
#include "immintrin.h"
Instruction: vpbroadcastb
CPUID Flags: AVX512BW
Description
Broadcast 8-bit integer a to all elements of dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k[j]
dst[i+7:i] := a[7:0]
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
...
__m512i _mm512_set1_epi8 (char a)
Synopsis
__m512i _mm512_set1_epi8 (char a)
#include "immintrin.h"
Instruction: vpbroadcastb zmm, xmm
CPUID Flags: AVX512F
Description
Broadcast 8-bit integer a to all elements of dst.
Operation
FOR j := 0 to 63
i := j*8
dst[i+7:i] := a[7:0]
ENDFOR
dst[MAX:512] := 0
...
__m128d _mm_set1_pd (double a)
Synopsis
__m128d _mm_set1_pd (double a)
#include "emmintrin.h"
CPUID Flags: SSE2
Description
Broadcast double-precision (64-bit) floating-point value a to all elements of dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := a[63:0]
ENDFOR
...
__m256d _mm256_set1_pd (double a)
Synopsis
__m256d _mm256_set1_pd (double a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Broadcast double-precision (64-bit) floating-point value a to all elements of dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := a[63:0]
ENDFOR
dst[MAX:256] := 0
...
__m512d _mm512_set1_pd (double a)
Synopsis
__m512d _mm512_set1_pd (double a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Broadcast double-precision (64-bit) floating-point value a to all elements of dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := a[63:0]
ENDFOR
dst[MAX:512] := 0
...
__m128 _mm_set1_ps (float a)
Synopsis
__m128 _mm_set1_ps (float a)
#include "xmmintrin.h"
CPUID Flags: SSE
Description
Broadcast single-precision (32-bit) floating-point value a to all elements of dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := a[31:0]
ENDFOR
...
__m256 _mm256_set1_ps (float a)
Synopsis
__m256 _mm256_set1_ps (float a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Broadcast single-precision (32-bit) floating-point value a to all elements of dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := a[31:0]
ENDFOR
dst[MAX:256] := 0
...
__m512 _mm512_set1_ps (float a)
Synopsis
__m512 _mm512_set1_ps (float a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Broadcast single-precision (32-bit) floating-point value a to all elements of dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := a[31:0]
ENDFOR
dst[MAX:512] := 0
...
__m512i _mm512_set4_epi32 (int d, int c, int b, int a)
Synopsis
__m512i _mm512_set4_epi32 (int d, int c, int b, int a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Set packed 32-bit integers in dst with the repeated 4 element sequence.
Operation
dst[31:0] := d
dst[63:32] := c
dst[95:64] := b
dst[127:96] := a
dst[159:128] := d
dst[191:160] := c
dst[223:192] := b
dst[255:224] := a
dst[287:256] := d
dst[319:288] := c
dst[351:320] := b
dst[383:352] := a
dst[415:384] := d
dst[447:416] := c
dst[479:448] := b
dst[511:480] := a
dst[MAX:512] := 0
...
__m512i _mm512_set4_epi64 (__int64 d, __int64 c, __int64 b, __int64 a)
Synopsis
__m512i _mm512_set4_epi64 (__int64 d, __int64 c, __int64 b, __int64 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Set packed 64-bit integers in dst with the repeated 4 element sequence.
Operation
dst[63:0] := d
dst[127:64] := c
dst[191:128] := b
dst[255:192] := a
dst[319:256] := d
dst[383:320] := c
dst[447:384] := b
dst[511:448] := a
dst[MAX:512] := 0
...
__m512d _mm512_set4_pd (double d, double c, double b, double a)
Synopsis
__m512d _mm512_set4_pd (double d, double c, double b, double a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Set packed double-precision (64-bit) floating-point elements in dst with the repeated 4 element sequence.
Operation
dst[63:0] := d
dst[127:64] := c
dst[191:128] := b
dst[255:192] := a
dst[319:256] := d
dst[383:320] := c
dst[447:384] := b
dst[511:448] := a
dst[MAX:512] := 0
...
__m512 _mm512_set4_ps (float d, float c, float b, float a)
Synopsis
__m512 _mm512_set4_ps (float d, float c, float b, float a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Set packed single-precision (32-bit) floating-point elements in dst with the repeated 4 element sequence.
Operation
dst[31:0] := d
dst[63:32] := c
dst[95:64] := b
dst[127:96] := a
dst[159:128] := d
dst[191:160] := c
dst[223:192] := b
dst[255:224] := a
dst[287:256] := d
dst[319:288] := c
dst[351:320] := b
dst[383:352] := a
dst[415:384] := d
dst[447:416] := c
dst[479:448] := b
dst[511:480] := a
dst[MAX:512] := 0
ldmxcsr
void _mm_setcsr (unsigned int a)
Synopsis
void _mm_setcsr (unsigned int a)
#include "xmmintrin.h"
Instruction: ldmxcsr MEMd
CPUID Flags: SSE
Description
Set the MXCSR control and status register with the value in unsigned 32-bit integer a.
Operation
MXCSR := a[31:0]
Performance
...
__m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0)
Synopsis
__m128i _mm_setr_epi16 (short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0)
#include "emmintrin.h"
CPUID Flags: SSE2
Description
Set packed 16-bit integers in dst with the supplied values in reverse order.
Operation
dst[15:0] := e7
dst[31:16] := e6
dst[47:32] := e5
dst[63:48] := e4
dst[79:64] := e3
dst[95:80] := e2
dst[111:96] := e1
dst[127:112] := e0
...
__m256i _mm256_setr_epi16 (short e15, short e14, short e13, short e12, short e11, short e10, short e9, short e8, short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0)
Synopsis
__m256i _mm256_setr_epi16 (short e15, short e14, short e13, short e12, short e11, short e10, short e9, short e8, short e7, short e6, short e5, short e4, short e3, short e2, short e1, short e0)
#include "immintrin.h"
CPUID Flags: AVX
Description
Set packed 16-bit integers in dst with the supplied values in reverse order.
Operation
dst[15:0] := e15
dst[31:16] := e14
dst[47:32] := e13
dst[63:48] := e12
dst[79:64] := e11
dst[95:80] := e10
dst[111:96] := e9
dst[127:112] := e8
dst[143:128] := e7
dst[159:144] := e6
dst[175:160] := e5
dst[191:176] := e4
dst[207:192] := e3
dst[223:208] := e2
dst[239:224] := e1
dst[255:240] := e0
dst[MAX:256] := 0
...
__m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0)
Synopsis
__m128i _mm_setr_epi32 (int e3, int e2, int e1, int e0)
#include "emmintrin.h"
CPUID Flags: SSE2
Description
Set packed 32-bit integers in dst with the supplied values in reverse order.
Operation
dst[31:0] := e3
dst[63:32] := e2
dst[95:64] := e1
dst[127:96] := e0
...
__m256i _mm256_setr_epi32 (int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
Synopsis
__m256i _mm256_setr_epi32 (int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
#include "immintrin.h"
CPUID Flags: AVX
Description
Set packed 32-bit integers in dst with the supplied values in reverse order.
Operation
dst[31:0] := e7
dst[63:32] := e6
dst[95:64] := e5
dst[127:96] := e4
dst[159:128] := e3
dst[191:160] := e2
dst[223:192] := e1
dst[255:224] := e0
dst[MAX:256] := 0
...
__m512i _mm512_setr_epi32 (int e15, int e14, int e13, int e12, int e11, int e10, int e9, int e8, int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
Synopsis
__m512i _mm512_setr_epi32 (int e15, int e14, int e13, int e12, int e11, int e10, int e9, int e8, int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Set packed 32-bit integers in dst with the supplied values in reverse order.
Operation
dst[31:0] := e15
dst[63:32] := e14
dst[95:64] := e13
dst[127:96] := e12
dst[159:128] := e11
dst[191:160] := e10
dst[223:192] := e9
dst[255:224] := e8
dst[287:256] := e7
dst[319:288] := e6
dst[351:320] := e5
dst[383:352] := e4
dst[415:384] := e3
dst[447:416] := e2
dst[479:448] := e1
dst[511:480] := e0
dst[MAX:512] := 0
...
__m128i _mm_setr_epi64 (__m64 e1, __m64 e0)
Synopsis
__m128i _mm_setr_epi64 (__m64 e1, __m64 e0)
#include "emmintrin.h"
CPUID Flags: SSE2
Description
Set packed 64-bit integers in dst with the supplied values in reverse order.
Operation
dst[63:0] := e1
dst[127:64] := e0
...
__m512i _mm512_setr_epi64 (__int64 e7, __int64 e6, __int64 e5, __int64 e4, __int64 e3, __int64 e2, __int64 e1, __int64 e0)
Synopsis
__m512i _mm512_setr_epi64 (__int64 e7, __int64 e6, __int64 e5, __int64 e4, __int64 e3, __int64 e2, __int64 e1, __int64 e0)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Set packed 64-bit integers in dst with the supplied values in reverse order.
Operation
dst[63:0] := e7
dst[127:64] := e6
dst[191:128] := e5
dst[255:192] := e4
dst[319:256] := e3
dst[383:320] := e2
dst[447:384] := e1
dst[511:448] := e0
dst[MAX:512] := 0
...
__m256i _mm256_setr_epi64x (__int64 e3, __int64 e2, __int64 e1, __int64 e0)
Synopsis
__m256i _mm256_setr_epi64x (__int64 e3, __int64 e2, __int64 e1, __int64 e0)
#include "immintrin.h"
CPUID Flags: AVX
Description
Set packed 64-bit integers in dst with the supplied values in reverse order.
Operation
dst[63:0] := e3
dst[127:64] := e2
dst[191:128] := e1
dst[255:192] := e0
dst[MAX:256] := 0
...
__m128i _mm_setr_epi8 (char e15, char e14, char e13, char e12, char e11, char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0)
Synopsis
__m128i _mm_setr_epi8 (char e15, char e14, char e13, char e12, char e11, char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0)
#include "emmintrin.h"
CPUID Flags: SSE2
Description
Set packed 8-bit integers in dst with the supplied values in reverse order.
Operation
dst[7:0] := e15
dst[15:8] := e14
dst[23:16] := e13
dst[31:24] := e12
dst[39:32] := e11
dst[47:40] := e10
dst[55:48] := e9
dst[63:56] := e8
dst[71:64] := e7
dst[79:72] := e6
dst[87:80] := e5
dst[95:88] := e4
dst[103:96] := e3
dst[111:104] := e2
dst[119:112] := e1
dst[127:120] := e0
...
__m256i _mm256_setr_epi8 (char e31, char e30, char e29, char e28, char e27, char e26, char e25, char e24, char e23, char e22, char e21, char e20, char e19, char e18, char e17, char e16, char e15, char e14, char e13, char e12, char e11, char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0)
Synopsis
__m256i _mm256_setr_epi8 (char e31, char e30, char e29, char e28, char e27, char e26, char e25, char e24, char e23, char e22, char e21, char e20, char e19, char e18, char e17, char e16, char e15, char e14, char e13, char e12, char e11, char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0)
#include "immintrin.h"
CPUID Flags: AVX
Description
Set packed 8-bit integers in dst with the supplied values in reverse order.
Operation
dst[7:0] := e31
dst[15:8] := e30
dst[23:16] := e29
dst[31:24] := e28
dst[39:32] := e27
dst[47:40] := e26
dst[55:48] := e25
dst[63:56] := e24
dst[71:64] := e23
dst[79:72] := e22
dst[87:80] := e21
dst[95:88] := e20
dst[103:96] := e19
dst[111:104] := e18
dst[119:112] := e17
dst[127:120] := e16
dst[135:128] := e15
dst[143:136] := e14
dst[151:144] := e13
dst[159:152] := e12
dst[167:160] := e11
dst[175:168] := e10
dst[183:176] := e9
dst[191:184] := e8
dst[199:192] := e7
dst[207:200] := e6
dst[215:208] := e5
dst[223:216] := e4
dst[231:224] := e3
dst[239:232] := e2
dst[247:240] := e1
dst[255:248] := e0
dst[MAX:256] := 0
vinsertf128
__m256 _mm256_setr_m128 (__m128 lo, __m128 hi)
Synopsis
__m256 _mm256_setr_m128 (__m128 lo, __m128 hi)
#include "immintrin.h"
Instruction: vinsertf128 ymm, ymm, xmm, imm
CPUID Flags: AVX
Description
Set packed __m256 vector dst with the supplied values.
Operation
dst[127:0] := lo[127:0]
dst[255:128] := hi[127:0]
dst[MAX:256] := 0
Performance
vinsertf128
__m256d _mm256_setr_m128d (__m128d lo, __m128d hi)
Synopsis
__m256d _mm256_setr_m128d (__m128d lo, __m128d hi)
#include "immintrin.h"
Instruction: vinsertf128 ymm, ymm, xmm, imm
CPUID Flags: AVX
Description
Set packed __m256d vector dst with the supplied values.
Operation
dst[127:0] := lo[127:0]
dst[255:128] := hi[127:0]
dst[MAX:256] := 0
Performance
vinsertf128
__m256i _mm256_setr_m128i (__m128i lo, __m128i hi)
Synopsis
__m256i _mm256_setr_m128i (__m128i lo, __m128i hi)
#include "immintrin.h"
Instruction: vinsertf128 ymm, ymm, xmm, imm
CPUID Flags: AVX
Description
Set packed __m256i vector dst with the supplied values.
Operation
dst[127:0] := lo[127:0]
dst[255:128] := hi[127:0]
dst[MAX:256] := 0
Performance
...
__m128d _mm_setr_pd (double e1, double e0)
Synopsis
__m128d _mm_setr_pd (double e1, double e0)
#include "emmintrin.h"
CPUID Flags: SSE2
Description
Set packed double-precision (64-bit) floating-point elements in dst with the supplied values in reverse order.
Operation
dst[63:0] := e1
dst[127:64] := e0
...
__m256d _mm256_setr_pd (double e3, double e2, double e1, double e0)
Synopsis
__m256d _mm256_setr_pd (double e3, double e2, double e1, double e0)
#include "immintrin.h"
CPUID Flags: AVX
Description
Set packed double-precision (64-bit) floating-point elements in dst with the supplied values in reverse order.
Operation
dst[63:0] := e3
dst[127:64] := e2
dst[191:128] := e1
dst[255:192] := e0
dst[MAX:256] := 0
...
__m512d _mm512_setr_pd (double e7, double e6, double e5, double e4, double e3, double e2, double e1, double e0)
Synopsis
__m512d _mm512_setr_pd (double e7, double e6, double e5, double e4, double e3, double e2, double e1, double e0)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Set packed double-precision (64-bit) floating-point elements in dst with the supplied values in reverse order.
Operation
dst[63:0] := e7
dst[127:64] := e6
dst[191:128] := e5
dst[255:192] := e4
dst[319:256] := e3
dst[383:320] := e2
dst[447:384] := e1
dst[511:448] := e0
dst[MAX:512] := 0
...
__m128 _mm_setr_ps (float e3, float e2, float e1, float e0)
Synopsis
__m128 _mm_setr_ps (float e3, float e2, float e1, float e0)
#include "xmmintrin.h"
CPUID Flags: SSE
Description
Set packed single-precision (32-bit) floating-point elements in dst with the supplied values in reverse order.
Operation
dst[31:0] := e3
dst[63:32] := e2
dst[95:64] := e1
dst[127:96] := e0
...
__m256 _mm256_setr_ps (float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
Synopsis
__m256 _mm256_setr_ps (float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
#include "immintrin.h"
CPUID Flags: AVX
Description
Set packed single-precision (32-bit) floating-point elements in dst with the supplied values in reverse order.
Operation
dst[31:0] := e7
dst[63:32] := e6
dst[95:64] := e5
dst[127:96] := e4
dst[159:128] := e3
dst[191:160] := e2
dst[223:192] := e1
dst[255:224] := e0
dst[MAX:256] := 0
...
__m512 _mm512_setr_ps (float e15, float e14, float e13, float e12, float e11, float e10, float e9, float e8, float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
Synopsis
__m512 _mm512_setr_ps (float e15, float e14, float e13, float e12, float e11, float e10, float e9, float e8, float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Set packed single-precision (32-bit) floating-point elements in dst with the supplied values in reverse order.
Operation
dst[31:0] := e15
dst[63:32] := e14
dst[95:64] := e13
dst[127:96] := e12
dst[159:128] := e11
dst[191:160] := e10
dst[223:192] := e9
dst[255:224] := e8
dst[287:256] := e7
dst[319:288] := e6
dst[351:320] := e5
dst[383:352] := e4
dst[415:384] := e3
dst[447:416] := e2
dst[479:448] := e1
dst[511:480] := e0
dst[MAX:512] := 0
...
__m512i _mm512_setr4_epi32 (int d, int c, int b, int a)
Synopsis
__m512i _mm512_setr4_epi32 (int d, int c, int b, int a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Set packed 32-bit integers in dst with the repeated 4 element sequence in reverse order.
Operation
dst[31:0] := a
dst[63:32] := b
dst[95:64] := c
dst[127:96] := d
dst[159:128] := a
dst[191:160] := b
dst[223:192] := c
dst[255:224] := d
dst[287:256] := a
dst[319:288] := b
dst[351:320] := c
dst[383:352] := d
dst[415:384] := a
dst[447:416] := b
dst[479:448] := c
dst[511:480] := d
dst[MAX:512] := 0
...
__m512i _mm512_setr4_epi64 (__int64 d, __int64 c, __int64 b, __int64 a)
Synopsis
__m512i _mm512_setr4_epi64 (__int64 d, __int64 c, __int64 b, __int64 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Set packed 64-bit integers in dst with the repeated 4 element sequence in reverse order.
Operation
dst[63:0] := a
dst[127:64] := b
dst[191:128] := c
dst[255:192] := d
dst[319:256] := a
dst[383:320] := b
dst[447:384] := c
dst[511:448] := d
dst[MAX:512] := 0
...
__m512d _mm512_setr4_pd (double d, double c, double b, double a)
Synopsis
__m512d _mm512_setr4_pd (double d, double c, double b, double a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Set packed double-precision (64-bit) floating-point elements in dst with the repeated 4 element sequence in reverse order.
Operation
dst[63:0] := a
dst[127:64] := b
dst[191:128] := c
dst[255:192] := d
dst[319:256] := a
dst[383:320] := b
dst[447:384] := c
dst[511:448] := d
dst[MAX:512] := 0
...
__m512 _mm512_setr4_ps (float d, float c, float b, float a)
Synopsis
__m512 _mm512_setr4_ps (float d, float c, float b, float a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Set packed single-precision (32-bit) floating-point elements in dst with the repeated 4 element sequence in reverse order.
Operation
dst[31:0] := a
dst[63:32] := b
dst[95:64] := c
dst[127:96] := d
dst[159:128] := a
dst[191:160] := b
dst[223:192] := c
dst[255:224] := d
dst[287:256] := a
dst[319:288] := b
dst[351:320] := c
dst[383:352] := d
dst[415:384] := a
dst[447:416] := b
dst[479:448] := c
dst[511:480] := d
dst[MAX:512] := 0
vpxorq
__m512 _mm512_setzero (void)
Synopsis
__m512 _mm512_setzero (void)
#include "immintrin.h"
Instruction: vpxorq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Return vector of type __m512 with all elements set to zero.
Operation
dst[MAX:0] := 0
vpxorq
__m512i _mm512_setzero_epi32 ()
Synopsis
__m512i _mm512_setzero_epi32 ()
#include "immintrin.h"
Instruction: vpxorq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Return vector of type __m512i with all elements set to zero.
Operation
dst[MAX:0] := 0
xorpd
__m128d _mm_setzero_pd (void)
Synopsis
__m128d _mm_setzero_pd (void)
#include "emmintrin.h"
Instruction: xorpd xmm, xmm
CPUID Flags: SSE2
Description
Return vector of type __m128d with all elements set to zero.
Operation
dst[MAX:0] := 0
Performance
vxorpd
__m256d _mm256_setzero_pd (void)
Synopsis
__m256d _mm256_setzero_pd (void)
#include "immintrin.h"
Instruction: vxorpd ymm, ymm, ymm
CPUID Flags: AVX
Description
Return vector of type __m256d with all elements set to zero.
Operation
dst[MAX:0] := 0
Performance
vpxorq
__m512d _mm512_setzero_pd ()
Synopsis
__m512d _mm512_setzero_pd ()
#include "immintrin.h"
Instruction: vpxorq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Return vector of type __m512d with all elements set to zero.
Operation
dst[MAX:0] := 0
xorps
__m128 _mm_setzero_ps (void)
Synopsis
__m128 _mm_setzero_ps (void)
#include "xmmintrin.h"
Instruction: xorps xmm, xmm
CPUID Flags: SSE
Description
Return vector of type __m128 with all elements set to zero.
Operation
dst[MAX:0] := 0
Performance
vxorps
__m256 _mm256_setzero_ps (void)
Synopsis
__m256 _mm256_setzero_ps (void)
#include "immintrin.h"
Instruction: vxorps ymm, ymm, ymm
CPUID Flags: AVX
Description
Return vector of type __m256 with all elements set to zero.
Operation
dst[MAX:0] := 0
Performance
vpxorq
__m512 _mm512_setzero_ps ()
Synopsis
__m512 _mm512_setzero_ps ()
#include "immintrin.h"
Instruction: vpxorq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Return vector of type __m512 with all elements set to zero.
Operation
dst[MAX:0] := 0
pxor
__m128i _mm_setzero_si128 ()
Synopsis
__m128i _mm_setzero_si128 ()
#include "emmintrin.h"
Instruction: pxor xmm, xmm
CPUID Flags: SSE2
Description
Return vector of type __m128i with all elements set to zero.
Operation
dst[MAX:0] := 0
Performance
vpxor
__m256i _mm256_setzero_si256 (void)
Synopsis
__m256i _mm256_setzero_si256 (void)
#include "immintrin.h"
Instruction: vpxor ymm, ymm, ymm
CPUID Flags: AVX
Description
Return vector of type __m256i with all elements set to zero.
Operation
dst[MAX:0] := 0
Performance
vpxorq
__m512i _mm512_setzero_si512 ()
Synopsis
__m512i _mm512_setzero_si512 ()
#include "immintrin.h"
Instruction: vpxorq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Return vector of type __m512i with all elements set to zero.
Operation
dst[MAX:0] := 0
sfence
void _mm_sfence (void)
Synopsis
void _mm_sfence (void)
#include "xmmintrin.h"
Instruction: sfence
CPUID Flags: SSE
Description
Perform a serializing operation on all store-to-memory instructions that were issued prior to this instruction. Guarantees that every store instruction that precedes, in program order, is globally visible before any store instruction which follows the fence in program order.
sha1msg1
__m128i _mm_sha1msg1_epu32 (__m128i a, __m128i b)
Synopsis
__m128i _mm_sha1msg1_epu32 (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: sha1msg1 xmm, xmm
CPUID Flags: SHA
Description
Perform an intermediate calculation for the next four SHA1 message values (unsigned 32-bit integers) using previous message values from a and b, and store the result in dst.
Operation
W0 := a[127:96];
W1 := a[95:64];
W2 := a[63:32];
W3 := a[31:0];
W4 := b[127:96];
W5 := b[95:64];
dst[127:96] := W2 XOR W0;
dst[95:64] := W3 XOR W1;
dst[63:32] := W4 XOR W2;
dst[31:0] := W5 XOR W3;
sha1msg2
__m128i _mm_sha1msg2_epu32 (__m128i a, __m128i b)
Synopsis
__m128i _mm_sha1msg2_epu32 (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: sha1msg2 xmm, xmm
CPUID Flags: SHA
Description
Perform the final calculation for the next four SHA1 message values (unsigned 32-bit integers) using the intermediate result in a and the previous message values in b, and store the result in dst.
Operation
W13 := b[95:64];
W14 := b[63:32];
W15 := b[31:0];
W16 := (a[127:96] XOR W13) <<< 1;
W17 := (a[95:64] XOR W14) <<< 1;
W18 := (a[63:32] XOR W15) <<< 1;
W19 := (a[31:0] XOR W16) <<< 1;
dst[127:96] := W16;
dst[95:64] := W17;
dst[63:32] := W18;
dst[31:0] := W19;
sha1nexte
__m128i _mm_sha1nexte_epu32 (__m128i a, __m128i b)
Synopsis
__m128i _mm_sha1nexte_epu32 (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: sha1nexte xmm, xmm
CPUID Flags: SHA
Description
Calculate SHA1 state variable E after four rounds of operation from the current SHA1 state variable a, add that value to the scheduled values (unsigned 32-bit integers) in b, and store the result in dst.
Operation
tmp := (a[127:96] <<< 30);
dst[127:96] := b[127:96] + tmp;
dst[95:64] := b[95:64];
dst[63:32] := b[63:32];
dst[31:0] := b[31:0];
sha1rnds4
__m128i _mm_sha1rnds4_epu32 (__m128i a, __m128i b, const int func)
Synopsis
__m128i _mm_sha1rnds4_epu32 (__m128i a, __m128i b, const int func)
#include "immintrin.h"
Instruction: sha1rnds4 xmm, xmm, imm
CPUID Flags: SHA
Description
Perform four rounds of SHA1 operation using an initial SHA1 state (A,B,C,D) from a and some pre-computed sum of the next 4 round message values (unsigned 32-bit integers), and state variable E from b, and store the updated SHA1 state (A,B,C,D) in dst. func contains the logic functions and round constants.
Operation
IF (func[1:0] = 0) THEN
f() := f0(), K := K0;
ELSE IF (func[1:0] = 1) THEN
f() := f1(), K := K1;
ELSE IF (func[1:0] = 2) THEN
f() := f2(), K := K2;
ELSE IF (func[1:0] = 3) THEN
f() := f3(), K := K3;
FI;
A := a[127:96];
B := a[95:64];
C := a[63:32];
D := a[31:0];
W[0] := b[127:96];
W[1] := b[95:64];
W[2] := b[63:32];
W[3] := b[31:0];
A[1] := f(B, C, D) + (A <<< 5) + W[0] + K;
B[1] := A;
C[1] := B <<< 30;
D[1] := C;
E[1] := D;
FOR i = 1 to 3
A[i+1] := f(B[i], C[i], D[i]) + (A[i] <<< 5) + W[i] + E[i] + K;
B[i+1] := A[i];
C[i+1] := B[i] <<< 30;
D[i+1] := C[i];
E[i+1] := D[i];
ENDFOR;
dst[127:96] := A[4];
dst[95:64] := B[4];
dst[63:32] := C[4];
dst[31:0] := D[4];
sha256msg1
__m128i _mm_sha256msg1_epu32 (__m128i a, __m128i b)
Synopsis
__m128i _mm_sha256msg1_epu32 (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: sha256msg1 xmm, xmm
CPUID Flags: SHA
Description
Perform an intermediate calculation for the next four SHA256 message values (unsigned 32-bit integers) using previous message values from a and b, and store the result in dst.
Operation
W4 := b[31:0];
W3 := a[127:96];
W2 := a[95:64];
W1 := a[63:32];
W0 := a[31:0];
dst[127:96] := W3 + sigma0(W4);
dst[95:64] := W2 + sigma0(W3);
dst[63:32] := W1 + sigma0(W2);
dst[31:0] := W0 + sigma0(W1);
sha256msg2
__m128i _mm_sha256msg2_epu32 (__m128i a, __m128i b)
Synopsis
__m128i _mm_sha256msg2_epu32 (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: sha256msg2 xmm, xmm
CPUID Flags: SHA
Description
Perform the final calculation for the next four SHA256 message values (unsigned 32-bit integers) using previous message values from a and b, and store the result in dst.
Operation
W14 := b[95:64];
W15 := b[127:96];
W16 := a[31:0] + sigma1(W14);
W17 := a[63:32] + sigma1(W15);
W18 := a[95:64] + sigma1(W16);
W19 := a[127:96] + sigma1(W17);
dst[127:96] := W19;
dst[95:64] := W18;
dst[63:32] := W17;
dst[31:0] := W16;
sha256rnds2
__m128i _mm_sha256rnds2_epu32 (__m128i a, __m128i b, __m128i k)
Synopsis
__m128i _mm_sha256rnds2_epu32 (__m128i a, __m128i b, __m128i k)
#include "immintrin.h"
Instruction: sha256rnds2 xmm, xmm
CPUID Flags: SHA
Description
Perform 2 rounds of SHA256 operation using an initial SHA256 state (C,D,G,H) from a, an initial SHA256 state (A,B,E,F) from b, and a pre-computed sum of the next 2 round message values (unsigned 32-bit integers) and the corresponding round constants from k, and store the updated SHA256 state (A,B,E,F) in dst.
Operation
A[0] := b[127:96];
B[0] := b[95:64];
C[0] := a[127:96];
D[0] := a[95:64];
E[0] := b[63:32];
F[0] := b[31:0];
G[0] := a[63:32];
H[0] := a[31:0];
W_K0 := k[31:0];
W_K1 := k[63:32];
FOR i = 0 to 1
A_(i+1) := Ch(E[i], F[i], G[i]) + sum1(E[i]) + W_K[i] + H[i] + Maj(A[i], B[i], C[i]) + sum0(A[i]);
B_(i+1) := A[i];
C_(i+1) := B[i];
D_(i+1) := C[i];
E_(i+1) := Ch(E[i], F[i], G[i]) + sum1(E[i]) + W_K[i] + H[i] + D[i];
F_(i+1) := E[i];
G_(i+1) := F[i];
H_(i+1) := G[i];
ENDFOR;
dst[127:96] := A[2];
dst[95:64] := B[2];
dst[63:32] := E[2];
dst[31:0] := F[2];
vpshufd
__m128i _mm_mask_shuffle_epi32 (__m128i src, __mmask8 k, __m128i a, _MM_PERM_ENUM imm8)
Synopsis
__m128i _mm_mask_shuffle_epi32 (__m128i src, __mmask8 k, __m128i a, _MM_PERM_ENUM imm8)
#include "immintrin.h"
Instruction: vpshufd
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle 32-bit integers in a using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[31:0] := src[31:0]
1: tmp[31:0] := src[63:32]
2: tmp[31:0] := src[95:64]
3: tmp[31:0] := src[127:96]
ESAC
RETURN tmp[31:0]
}
tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vpshufd
__m128i _mm_maskz_shuffle_epi32 (__mmask8 k, __m128i a, _MM_PERM_ENUM imm8)
Synopsis
__m128i _mm_maskz_shuffle_epi32 (__mmask8 k, __m128i a, _MM_PERM_ENUM imm8)
#include "immintrin.h"
Instruction: vpshufd
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle 32-bit integers in a using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[31:0] := src[31:0]
1: tmp[31:0] := src[63:32]
2: tmp[31:0] := src[95:64]
3: tmp[31:0] := src[127:96]
ESAC
RETURN tmp[31:0]
}
tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
pshufd
__m128i _mm_shuffle_epi32 (__m128i a, int imm8)
Synopsis
__m128i _mm_shuffle_epi32 (__m128i a, int imm8)
#include "emmintrin.h"
Instruction: pshufd xmm, xmm, imm
CPUID Flags: SSE2
Description
Shuffle 32-bit integers in a using the control in imm8, and store the results in dst.
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[31:0] := src[31:0]
1: tmp[31:0] := src[63:32]
2: tmp[31:0] := src[95:64]
3: tmp[31:0] := src[127:96]
ESAC
RETURN tmp[31:0]
}
dst[31:0] := SELECT4(a[127:0], imm8[1:0])
dst[63:32] := SELECT4(a[127:0], imm8[3:2])
dst[95:64] := SELECT4(a[127:0], imm8[5:4])
dst[127:96] := SELECT4(a[127:0], imm8[7:6])
Performance
vpshufd
__m256i _mm256_mask_shuffle_epi32 (__m256i src, __mmask8 k, __m256i a, _MM_PERM_ENUM imm8)
Synopsis
__m256i _mm256_mask_shuffle_epi32 (__m256i src, __mmask8 k, __m256i a, _MM_PERM_ENUM imm8)
#include "immintrin.h"
Instruction: vpshufd
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[31:0] := src[31:0]
1: tmp[31:0] := src[63:32]
2: tmp[31:0] := src[95:64]
3: tmp[31:0] := src[127:96]
ESAC
RETURN tmp[31:0]
}
tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4])
tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6])
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vpshufd
__m256i _mm256_maskz_shuffle_epi32 (__mmask8 k, __m256i a, _MM_PERM_ENUM imm8)
Synopsis
__m256i _mm256_maskz_shuffle_epi32 (__mmask8 k, __m256i a, _MM_PERM_ENUM imm8)
#include "immintrin.h"
Instruction: vpshufd
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[31:0] := src[31:0]
1: tmp[31:0] := src[63:32]
2: tmp[31:0] := src[95:64]
3: tmp[31:0] := src[127:96]
ESAC
RETURN tmp[31:0]
}
tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4])
tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6])
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpshufd
__m256i _mm256_shuffle_epi32 (__m256i a, const int imm8)
Synopsis
__m256i _mm256_shuffle_epi32 (__m256i a, const int imm8)
#include "immintrin.h"
Instruction: vpshufd ymm, ymm, imm
CPUID Flags: AVX2
Description
Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst.
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[31:0] := src[31:0]
1: tmp[31:0] := src[63:32]
2: tmp[31:0] := src[95:64]
3: tmp[31:0] := src[127:96]
ESAC
RETURN tmp[31:0]
}
dst[31:0] := SELECT4(a[127:0], imm8[1:0])
dst[63:32] := SELECT4(a[127:0], imm8[3:2])
dst[95:64] := SELECT4(a[127:0], imm8[5:4])
dst[127:96] := SELECT4(a[127:0], imm8[7:6])
dst[159:128] := SELECT4(a[255:128], imm8[1:0])
dst[191:160] := SELECT4(a[255:128], imm8[3:2])
dst[223:192] := SELECT4(a[255:128], imm8[5:4])
dst[255:224] := SELECT4(a[255:128], imm8[7:6])
dst[MAX:256] := 0
Performance
vpshufd
__m512i _mm512_mask_shuffle_epi32 (__m512i src, __mmask16 k, __m512i a, _MM_PERM_ENUM imm8)
Synopsis
__m512i _mm512_mask_shuffle_epi32 (__m512i src, __mmask16 k, __m512i a, _MM_PERM_ENUM imm8)
#include "immintrin.h"
Instruction: vpshufd zmm {k}, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[31:0] := src[31:0]
1: tmp[31:0] := src[63:32]
2: tmp[31:0] := src[95:64]
3: tmp[31:0] := src[127:96]
ESAC
RETURN tmp[31:0]
}
tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4])
tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6])
tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0])
tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2])
tmp_dst[351:320] := SELECT4(a[383:256], imm8[5:4])
tmp_dst[383:352] := SELECT4(a[383:256], imm8[7:6])
tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0])
tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2])
tmp_dst[479:448] := SELECT4(a[511:384], imm8[5:4])
tmp_dst[511:480] := SELECT4(a[511:384], imm8[7:6])
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpshufd
__m512i _mm512_maskz_shuffle_epi32 (__mmask16 k, __m512i a, _MM_PERM_ENUM imm8)
Synopsis
__m512i _mm512_maskz_shuffle_epi32 (__mmask16 k, __m512i a, _MM_PERM_ENUM imm8)
#include "immintrin.h"
Instruction: vpshufd zmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[31:0] := src[31:0]
1: tmp[31:0] := src[63:32]
2: tmp[31:0] := src[95:64]
3: tmp[31:0] := src[127:96]
ESAC
RETURN tmp[31:0]
}
tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(a[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(a[127:0], imm8[7:6])
tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
tmp_dst[223:192] := SELECT4(a[255:128], imm8[5:4])
tmp_dst[255:224] := SELECT4(a[255:128], imm8[7:6])
tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0])
tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2])
tmp_dst[351:320] := SELECT4(a[383:256], imm8[5:4])
tmp_dst[383:352] := SELECT4(a[383:256], imm8[7:6])
tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0])
tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2])
tmp_dst[479:448] := SELECT4(a[511:384], imm8[5:4])
tmp_dst[511:480] := SELECT4(a[511:384], imm8[7:6])
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpshufd
__m512i _mm512_shuffle_epi32 (__m512i a, _MM_PERM_ENUM imm8)
Synopsis
__m512i _mm512_shuffle_epi32 (__m512i a, _MM_PERM_ENUM imm8)
#include "immintrin.h"
Instruction: vpshufd zmm {k}, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst.
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[31:0] := src[31:0]
1: tmp[31:0] := src[63:32]
2: tmp[31:0] := src[95:64]
3: tmp[31:0] := src[127:96]
ESAC
RETURN tmp[31:0]
}
dst[31:0] := SELECT4(a[127:0], imm8[1:0])
dst[63:32] := SELECT4(a[127:0], imm8[3:2])
dst[95:64] := SELECT4(a[127:0], imm8[5:4])
dst[127:96] := SELECT4(a[127:0], imm8[7:6])
dst[159:128] := SELECT4(a[255:128], imm8[1:0])
dst[191:160] := SELECT4(a[255:128], imm8[3:2])
dst[223:192] := SELECT4(a[255:128], imm8[5:4])
dst[255:224] := SELECT4(a[255:128], imm8[7:6])
dst[287:256] := SELECT4(a[383:256], imm8[1:0])
dst[319:288] := SELECT4(a[383:256], imm8[3:2])
dst[351:320] := SELECT4(a[383:256], imm8[5:4])
dst[383:352] := SELECT4(a[383:256], imm8[7:6])
dst[415:384] := SELECT4(a[511:384], imm8[1:0])
dst[447:416] := SELECT4(a[511:384], imm8[3:2])
dst[479:448] := SELECT4(a[511:384], imm8[5:4])
dst[511:480] := SELECT4(a[511:384], imm8[7:6])
dst[MAX:512] := 0
vpshufb
__m128i _mm_mask_shuffle_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_shuffle_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpshufb
CPUID Flags: AVX512VL + AVX512BW
Description
Shuffle packed 8-bit integers in a according to shuffle control mask in the corresponding 8-bit element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k[j]
IF b[i+7] == 1
dst[i+7:i] := 0
ELSE
index[3:0] := b[i+3:i]
dst[i+7:i] := a[index*8+7:index*8]
FI
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:128] := 0
vpshufb
__m128i _mm_maskz_shuffle_epi8 (__mmask16 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_shuffle_epi8 (__mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpshufb
CPUID Flags: AVX512VL + AVX512BW
Description
Shuffle packed 8-bit integers in a according to shuffle control mask in the corresponding 8-bit element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k[j]
IF b[i+7] == 1
dst[i+7:i] := 0
ELSE
index[3:0] := b[i+3:i]
dst[i+7:i] := a[index*8+7:index*8]
FI
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
pshufb
__m128i _mm_shuffle_epi8 (__m128i a, __m128i b)
Synopsis
__m128i _mm_shuffle_epi8 (__m128i a, __m128i b)
#include "tmmintrin.h"
Instruction: pshufb xmm, xmm
CPUID Flags: SSSE3
Description
Shuffle packed 8-bit integers in a according to shuffle control mask in the corresponding 8-bit element of b, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*8
IF b[i+7] == 1
dst[i+7:i] := 0
ELSE
index[3:0] := b[i+3:i]
dst[i+7:i] := a[index*8+7:index*8]
FI
ENDFOR
vpshufb
__m256i _mm256_mask_shuffle_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_shuffle_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpshufb
CPUID Flags: AVX512VL + AVX512BW
Description
Shuffle packed 8-bit integers in a according to shuffle control mask in the corresponding 8-bit element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k[j]
IF b[i+7] == 1
dst[i+7:i] := 0
ELSE
index[4:0] := b[i+3:i] + (j & 0x10)
dst[i+7:i] := a[index*8+7:index*8]
FI
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:256] := 0
vpshufb
__m256i _mm256_maskz_shuffle_epi8 (__mmask32 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_shuffle_epi8 (__mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpshufb
CPUID Flags: AVX512VL + AVX512BW
Description
Shuffle packed 8-bit integers in a according to shuffle control mask in the corresponding 8-bit element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k[j]
IF b[i+7] == 1
dst[i+7:i] := 0
ELSE
index[4:0] := b[i+3:i] + (j & 0x10)
dst[i+7:i] := a[index*8+7:index*8]
FI
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpshufb
__m256i _mm256_shuffle_epi8 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_shuffle_epi8 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpshufb ymm, ymm, ymm
CPUID Flags: AVX2
Description
Shuffle 8-bit integers in a within 128-bit lanes according to shuffle control mask in the corresponding 8-bit element of b, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*8
IF b[i+7] == 1
dst[i+7:i] := 0
ELSE
index[3:0] := b[i+3:i]
dst[i+7:i] := a[index*8+7:index*8]
FI
IF b[128+i+7] == 1
dst[128+i+7:128+i] := 0
ELSE
index[3:0] := b[128+i+3:128+i]
dst[128+i+7:128+i] := a[128+index*8+7:128+index*8]
FI
ENDFOR
dst[MAX:256] := 0
Performance
vpshufb
__m512i _mm512_mask_shuffle_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_shuffle_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpshufb
CPUID Flags: AVX512BW
Description
Shuffle 8-bit integers in a within 128-bit lanes using the control in the corresponding 8-bit element of b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k[j]
IF b[i+7] == 1
dst[i+7:i] := 0
ELSE
index[5:0] := b[i+3:i] + (j & 0x30)
dst[i+7:i] := a[index*8+7:index*8]
FI
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:512] := 0
vpshufb
__m512i _mm512_maskz_shuffle_epi8 (__mmask64 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_shuffle_epi8 (__mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpshufb
CPUID Flags: AVX512BW
Description
Shuffle packed 8-bit integers in a within 128-bit lanes according to shuffle control mask in the corresponding 8-bit element of b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k[j]
IF b[i+7] == 1
dst[i+7:i] := 0
ELSE
index[5:0] := b[i+3:i] + (j & 0x30)
dst[i+7:i] := a[index*8+7:index*8]
FI
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpshufb
__m512i _mm512_shuffle_epi8 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_shuffle_epi8 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpshufb
CPUID Flags: AVX512BW
Description
Shuffle packed 8-bit integers in a within 128-bit lanes according to shuffle control mask in the corresponding 8-bit element of b, and store the results in dst.
Operation
FOR j := 0 to 63
i := j*8
IF b[i+7] == 1
dst[i+7:i] := 0
ELSE
index[5:0] := b[i+3:i] + (j & 0x30)
dst[i+7:i] := a[index*8+7:index*8]
FI
ENDFOR
dst[MAX:512] := 0
vshuff32x4
__m256 _mm256_mask_shuffle_f32x4 (__m256 src, __mmask8 k, __m256 a, __m256 b, const int imm8)
Synopsis
__m256 _mm256_mask_shuffle_f32x4 (__m256 src, __mmask8 k, __m256 a, __m256 b, const int imm8)
#include "immintrin.h"
Instruction: vshuff32x4
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
SELECT2(src, control){
CASE(control[0])
0: tmp[127:0] := src[127:0]
1: tmp[127:0] := src[255:128]
ESAC
RETURN tmp[127:0]
}
tmp_dst[127:0] := SELECT2(a[255:0], imm8[0])
tmp_dst[255:128] := SELECT2(b[255:0], imm8[1])
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vshuff32x4
__m256 _mm256_maskz_shuffle_f32x4 (__mmask8 k, __m256 a, __m256 b, const int imm8)
Synopsis
__m256 _mm256_maskz_shuffle_f32x4 (__mmask8 k, __m256 a, __m256 b, const int imm8)
#include "immintrin.h"
Instruction: vshuff32x4
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
SELECT2(src, control){
CASE(control[0])
0: tmp[127:0] := src[127:0]
1: tmp[127:0] := src[255:128]
ESAC
RETURN tmp[127:0]
}
tmp_dst[127:0] := SELECT2(a[255:0], imm8[0])
tmp_dst[255:128] := SELECT2(b[255:0], imm8[1])
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vshuff32x4
__m256 _mm256_shuffle_f32x4 (__m256 a, __m256 b, const int imm8)
Synopsis
__m256 _mm256_shuffle_f32x4 (__m256 a, __m256 b, const int imm8)
#include "immintrin.h"
Instruction: vshuff32x4
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst.
Operation
SELECT2(src, control){
CASE(control[0])
0: tmp[127:0] := src[127:0]
1: tmp[127:0] := src[255:128]
ESAC
RETURN tmp[127:0]
}
dst[127:0] := SELECT2(a[255:0], imm8[0])
dst[255:128] := SELECT2(b[255:0], imm8[1])
dst[MAX:256] := 0
vshuff32x4
__m512 _mm512_mask_shuffle_f32x4 (__m512 src, __mmask16 k, __m512 a, __m512 b, const int imm8)
Synopsis
__m512 _mm512_mask_shuffle_f32x4 (__m512 src, __mmask16 k, __m512 a, __m512 b, const int imm8)
#include "immintrin.h"
Instruction: vshuff32x4 zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[127:0] := src[127:0]
1: tmp[127:0] := src[255:128]
2: tmp[127:0] := src[383:256]
3: tmp[127:0] := src[511:384]
ESAC
RETURN tmp[127:0]
}
tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0])
tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2])
tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4])
tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6])
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vshuff32x4
__m512 _mm512_maskz_shuffle_f32x4 (__mmask16 k, __m512 a, __m512 b, const int imm8)
Synopsis
__m512 _mm512_maskz_shuffle_f32x4 (__mmask16 k, __m512 a, __m512 b, const int imm8)
#include "immintrin.h"
Instruction: vshuff32x4 zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[127:0] := src[127:0]
1: tmp[127:0] := src[255:128]
2: tmp[127:0] := src[383:256]
3: tmp[127:0] := src[511:384]
ESAC
RETURN tmp[127:0]
}
tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0])
tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2])
tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4])
tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6])
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vshuff32x4
__m512 _mm512_shuffle_f32x4 (__m512 a, __m512 b, const int imm8)
Synopsis
__m512 _mm512_shuffle_f32x4 (__m512 a, __m512 b, const int imm8)
#include "immintrin.h"
Instruction: vshuff32x4 zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst.
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[127:0] := src[127:0]
1: tmp[127:0] := src[255:128]
2: tmp[127:0] := src[383:256]
3: tmp[127:0] := src[511:384]
ESAC
RETURN tmp[127:0]
}
dst[127:0] := SELECT4(a[511:0], imm8[1:0])
dst[255:128] := SELECT4(a[511:0], imm8[3:2])
dst[383:256] := SELECT4(b[511:0], imm8[5:4])
dst[511:384] := SELECT4(b[511:0], imm8[7:6])
dst[MAX:512] := 0
vshuff64x2
__m256d _mm256_mask_shuffle_f64x2 (__m256d src, __mmask8 k, __m256d a, __m256d b, const int imm8)
Synopsis
__m256d _mm256_mask_shuffle_f64x2 (__m256d src, __mmask8 k, __m256d a, __m256d b, const int imm8)
#include "immintrin.h"
Instruction: vshuff64x2
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
SELECT2(src, control){
CASE(control[0])
0: tmp[127:0] := src[127:0]
1: tmp[127:0] := src[255:128]
ESAC
RETURN tmp[127:0]
}
tmp_dst[127:0] := SELECT2(a[255:0], imm8[0])
tmp_dst[255:128] := SELECT2(b[255:0], imm8[1])
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vshuff64x2
__m256d _mm256_maskz_shuffle_f64x2 (__mmask8 k, __m256d a, __m256d b, const int imm8)
Synopsis
__m256d _mm256_maskz_shuffle_f64x2 (__mmask8 k, __m256d a, __m256d b, const int imm8)
#include "immintrin.h"
Instruction: vshuff64x2
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
SELECT2(src, control){
CASE(control[0])
0: tmp[127:0] := src[127:0]
1: tmp[127:0] := src[255:128]
ESAC
RETURN tmp[127:0]
}
tmp_dst[127:0] := SELECT2(a[255:0], imm8[0])
tmp_dst[255:128] := SELECT2(b[255:0], imm8[1])
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vshuff64x2
__m256d _mm256_shuffle_f64x2 (__m256d a, __m256d b, const int imm8)
Synopsis
__m256d _mm256_shuffle_f64x2 (__m256d a, __m256d b, const int imm8)
#include "immintrin.h"
Instruction: vshuff64x2
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst.
Operation
SELECT2(src, control){
CASE(control[0])
0: tmp[127:0] := src[127:0]
1: tmp[127:0] := src[255:128]
ESAC
RETURN tmp[127:0]
}
dst[127:0] := SELECT2(a[255:0], imm8[0])
dst[255:128] := SELECT2(b[255:0], imm8[1])
dst[MAX:256] := 0
vshuff64x2
__m512d _mm512_mask_shuffle_f64x2 (__m512d src, __mmask8 k, __m512d a, __m512d b, const int imm8)
Synopsis
__m512d _mm512_mask_shuffle_f64x2 (__m512d src, __mmask8 k, __m512d a, __m512d b, const int imm8)
#include "immintrin.h"
Instruction: vshuff64x2 zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[127:0] := src[127:0]
1: tmp[127:0] := src[255:128]
2: tmp[127:0] := src[383:256]
3: tmp[127:0] := src[511:384]
ESAC
RETURN tmp[127:0]
}
tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0])
tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2])
tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4])
tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6])
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vshuff64x2
__m512d _mm512_maskz_shuffle_f64x2 (__mmask8 k, __m512d a, __m512d b, const int imm8)
Synopsis
__m512d _mm512_maskz_shuffle_f64x2 (__mmask8 k, __m512d a, __m512d b, const int imm8)
#include "immintrin.h"
Instruction: vshuff64x2 zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[127:0] := src[127:0]
1: tmp[127:0] := src[255:128]
2: tmp[127:0] := src[383:256]
3: tmp[127:0] := src[511:384]
ESAC
RETURN tmp[127:0]
}
tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0])
tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2])
tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4])
tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6])
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vshuff64x2
__m512d _mm512_shuffle_f64x2 (__m512d a, __m512d b, const int imm8)
Synopsis
__m512d _mm512_shuffle_f64x2 (__m512d a, __m512d b, const int imm8)
#include "immintrin.h"
Instruction: vshuff64x2 zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Shuffle 128-bits (composed of 2 double-precision (64-bit) floating-point elements) selected by imm8 from a and b, and store the results in dst.
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[127:0] := src[127:0]
1: tmp[127:0] := src[255:128]
2: tmp[127:0] := src[383:256]
3: tmp[127:0] := src[511:384]
ESAC
RETURN tmp[127:0]
}
dst[127:0] := SELECT4(a[511:0], imm8[1:0])
dst[255:128] := SELECT4(a[511:0], imm8[3:2])
dst[383:256] := SELECT4(b[511:0], imm8[5:4])
dst[511:384] := SELECT4(b[511:0], imm8[7:6])
dst[MAX:512] := 0
vshufi32x4
__m256i _mm256_mask_shuffle_i32x4 (__m256i src, __mmask8 k, __m256i a, __m256i b, const int imm8)
Synopsis
__m256i _mm256_mask_shuffle_i32x4 (__m256i src, __mmask8 k, __m256i a, __m256i b, const int imm8)
#include "immintrin.h"
Instruction: vshufi32x4
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
SELECT2(src, control){
CASE(control[0])
0: tmp[127:0] := src[127:0]
1: tmp[127:0] := src[255:128]
ESAC
RETURN tmp[127:0]
}
tmp_dst[127:0] := SELECT2(a[255:0], imm8[1:0])
tmp_dst[255:128] := SELECT2(b[255:0], imm8[3:2])
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vshufi32x4
__m256i _mm256_maskz_shuffle_i32x4 (__mmask8 k, __m256i a, __m256i b, const int imm8)
Synopsis
__m256i _mm256_maskz_shuffle_i32x4 (__mmask8 k, __m256i a, __m256i b, const int imm8)
#include "immintrin.h"
Instruction: vshufi32x4
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
SELECT2(src, control){
CASE(control[0])
0: tmp[127:0] := src[127:0]
1: tmp[127:0] := src[255:128]
ESAC
RETURN tmp[127:0]
}
tmp_dst[127:0] := SELECT2(a[255:0], imm8[1:0])
tmp_dst[255:128] := SELECT2(b[255:0], imm8[3:2])
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vshufi32x4
__m256i _mm256_shuffle_i32x4 (__m256i a, __m256i b, const int imm8)
Synopsis
__m256i _mm256_shuffle_i32x4 (__m256i a, __m256i b, const int imm8)
#include "immintrin.h"
Instruction: vshufi32x4
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst.
Operation
SELECT2(src, control){
CASE(control[0])
0: tmp[127:0] := src[127:0]
1: tmp[127:0] := src[255:128]
ESAC
RETURN tmp[127:0]
}
dst[127:0] := SELECT2(a[255:0], imm8[1:0])
dst[255:128] := SELECT2(b[255:0], imm8[3:2])
dst[MAX:256] := 0
vshufi32x4
__m512i _mm512_mask_shuffle_i32x4 (__m512i src, __mmask16 k, __m512i a, __m512i b, const int imm8)
Synopsis
__m512i _mm512_mask_shuffle_i32x4 (__m512i src, __mmask16 k, __m512i a, __m512i b, const int imm8)
#include "immintrin.h"
Instruction: vshufi32x4 zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[127:0] := src[127:0]
1: tmp[127:0] := src[255:128]
2: tmp[127:0] := src[383:256]
3: tmp[127:0] := src[511:384]
ESAC
RETURN tmp[127:0]
}
tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0])
tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2])
tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4])
tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6])
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vshufi32x4
__m512i _mm512_maskz_shuffle_i32x4 (__mmask16 k, __m512i a, __m512i b, const int imm8)
Synopsis
__m512i _mm512_maskz_shuffle_i32x4 (__mmask16 k, __m512i a, __m512i b, const int imm8)
#include "immintrin.h"
Instruction: vshufi32x4 zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[127:0] := src[127:0]
1: tmp[127:0] := src[255:128]
2: tmp[127:0] := src[383:256]
3: tmp[127:0] := src[511:384]
ESAC
RETURN tmp[127:0]
}
tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0])
tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2])
tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4])
tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6])
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vshufi32x4
__m512i _mm512_shuffle_i32x4 (__m512i a, __m512i b, const int imm8)
Synopsis
__m512i _mm512_shuffle_i32x4 (__m512i a, __m512i b, const int imm8)
#include "immintrin.h"
Instruction: vshufi32x4 zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from a and b, and store the results in dst.
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[127:0] := src[127:0]
1: tmp[127:0] := src[255:128]
2: tmp[127:0] := src[383:256]
3: tmp[127:0] := src[511:384]
ESAC
RETURN tmp[127:0]
}
dst[127:0] := SELECT4(a[511:0], imm8[1:0])
dst[255:128] := SELECT4(a[511:0], imm8[3:2])
dst[383:256] := SELECT4(b[511:0], imm8[5:4])
dst[511:384] := SELECT4(b[511:0], imm8[7:6])
dst[MAX:512] := 0
vshufi64x2
__m256i _mm256_mask_shuffle_i64x2 (__m256i src, __mmask8 k, __m256i a, __m256i b, const int imm8)
Synopsis
__m256i _mm256_mask_shuffle_i64x2 (__m256i src, __mmask8 k, __m256i a, __m256i b, const int imm8)
#include "immintrin.h"
Instruction: vshufi64x2
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
SELECT2(src, control){
CASE(control[0])
0: tmp[127:0] := src[127:0]
1: tmp[127:0] := src[255:128]
ESAC
RETURN tmp[127:0]
}
tmp_dst[127:0] := SELECT2(a[255:0], imm8[1:0])
tmp_dst[255:128] := SELECT2(b[255:0], imm8[3:2])
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vshufi64x2
__m256i _mm256_maskz_shuffle_i64x2 (__mmask8 k, __m256i a, __m256i b, const int imm8)
Synopsis
__m256i _mm256_maskz_shuffle_i64x2 (__mmask8 k, __m256i a, __m256i b, const int imm8)
#include "immintrin.h"
Instruction: vshufi64x2
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
SELECT2(src, control){
CASE(control[0])
0: tmp[127:0] := src[127:0]
1: tmp[127:0] := src[255:128]
ESAC
RETURN tmp[127:0]
}
tmp_dst[127:0] := SELECT2(a[255:0], imm8[1:0])
tmp_dst[255:128] := SELECT2(b[255:0], imm8[3:2])
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vshufi64x2
__m256i _mm256_shuffle_i64x2 (__m256i a, __m256i b, const int imm8)
Synopsis
__m256i _mm256_shuffle_i64x2 (__m256i a, __m256i b, const int imm8)
#include "immintrin.h"
Instruction: vshufi64x2
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst.
Operation
SELECT2(src, control){
CASE(control[0])
0: tmp[127:0] := src[127:0]
1: tmp[127:0] := src[255:128]
ESAC
RETURN tmp[127:0]
}
dst[127:0] := SELECT2(a[255:0], imm8[1:0])
dst[255:128] := SELECT2(b[255:0], imm8[3:2])
dst[MAX:256] := 0
vshufi64x2
__m512i _mm512_mask_shuffle_i64x2 (__m512i src, __mmask8 k, __m512i a, __m512i b, const int imm8)
Synopsis
__m512i _mm512_mask_shuffle_i64x2 (__m512i src, __mmask8 k, __m512i a, __m512i b, const int imm8)
#include "immintrin.h"
Instruction: vshufi64x2 zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[127:0] := src[127:0]
1: tmp[127:0] := src[255:128]
2: tmp[127:0] := src[383:256]
3: tmp[127:0] := src[511:384]
ESAC
RETURN tmp[127:0]
}
tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0])
tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2])
tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4])
tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6])
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vshufi64x2
__m512i _mm512_maskz_shuffle_i64x2 (__mmask8 k, __m512i a, __m512i b, const int imm8)
Synopsis
__m512i _mm512_maskz_shuffle_i64x2 (__mmask8 k, __m512i a, __m512i b, const int imm8)
#include "immintrin.h"
Instruction: vshufi64x2 zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[127:0] := src[127:0]
1: tmp[127:0] := src[255:128]
2: tmp[127:0] := src[383:256]
3: tmp[127:0] := src[511:384]
ESAC
RETURN tmp[127:0]
}
tmp_dst[127:0] := SELECT4(a[511:0], imm8[1:0])
tmp_dst[255:128] := SELECT4(a[511:0], imm8[3:2])
tmp_dst[383:256] := SELECT4(b[511:0], imm8[5:4])
tmp_dst[511:384] := SELECT4(b[511:0], imm8[7:6])
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vshufi64x2
__m512i _mm512_shuffle_i64x2 (__m512i a, __m512i b, const int imm8)
Synopsis
__m512i _mm512_shuffle_i64x2 (__m512i a, __m512i b, const int imm8)
#include "immintrin.h"
Instruction: vshufi64x2 zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from a and b, and store the results in dst.
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[127:0] := src[127:0]
1: tmp[127:0] := src[255:128]
2: tmp[127:0] := src[383:256]
3: tmp[127:0] := src[511:384]
ESAC
RETURN tmp[127:0]
}
dst[127:0] := SELECT4(a[511:0], imm8[1:0])
dst[255:128] := SELECT4(a[511:0], imm8[3:2])
dst[383:256] := SELECT4(b[511:0], imm8[5:4])
dst[511:384] := SELECT4(b[511:0], imm8[7:6])
dst[MAX:512] := 0
vshufpd
__m128d _mm_mask_shuffle_pd (__m128d src, __mmask8 k, __m128d a, __m128d b, const int imm8)
Synopsis
__m128d _mm_mask_shuffle_pd (__m128d src, __mmask8 k, __m128d a, __m128d b, const int imm8)
#include "immintrin.h"
Instruction: vshufpd
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle double-precision (64-bit) floating-point elements using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vshufpd
__m128d _mm_maskz_shuffle_pd (__mmask8 k, __m128d a, __m128d b, const int imm8)
Synopsis
__m128d _mm_maskz_shuffle_pd (__mmask8 k, __m128d a, __m128d b, const int imm8)
#include "immintrin.h"
Instruction: vshufpd
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle double-precision (64-bit) floating-point elements using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
shufpd
__m128d _mm_shuffle_pd (__m128d a, __m128d b, int imm8)
Synopsis
__m128d _mm_shuffle_pd (__m128d a, __m128d b, int imm8)
#include "emmintrin.h"
Instruction: shufpd xmm, xmm, imm
CPUID Flags: SSE2
Description
Shuffle double-precision (64-bit) floating-point elements using the control in imm8, and store the results in dst.
Operation
dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
vshufpd
__m256d _mm256_mask_shuffle_pd (__m256d src, __mmask8 k, __m256d a, __m256d b, const int imm8)
Synopsis
__m256d _mm256_mask_shuffle_pd (__m256d src, __mmask8 k, __m256d a, __m256d b, const int imm8)
#include "immintrin.h"
Instruction: vshufpd
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
tmp_dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192]
tmp_dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192]
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vshufpd
__m256d _mm256_maskz_shuffle_pd (__mmask8 k, __m256d a, __m256d b, const int imm8)
Synopsis
__m256d _mm256_maskz_shuffle_pd (__mmask8 k, __m256d a, __m256d b, const int imm8)
#include "immintrin.h"
Instruction: vshufpd
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
tmp_dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192]
tmp_dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192]
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vshufpd
__m256d _mm256_shuffle_pd (__m256d a, __m256d b, const int imm8)
Synopsis
__m256d _mm256_shuffle_pd (__m256d a, __m256d b, const int imm8)
#include "immintrin.h"
Instruction: vshufpd ymm, ymm, ymm, imm
CPUID Flags: AVX
Description
Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst.
Operation
dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192]
dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192]
dst[MAX:256] := 0
Performance
vshufpd
__m512d _mm512_mask_shuffle_pd (__m512d src, __mmask8 k, __m512d a, __m512d b, const int imm8)
Synopsis
__m512d _mm512_mask_shuffle_pd (__m512d src, __mmask8 k, __m512d a, __m512d b, const int imm8)
#include "immintrin.h"
Instruction: vshufpd zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
tmp_dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192]
tmp_dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192]
tmp_dst[319:256] := (imm8[4] == 0) ? a[319:256] : a[383:320]
tmp_dst[383:320] := (imm8[5] == 0) ? b[319:256] : b[383:320]
tmp_dst[447:384] := (imm8[6] == 0) ? a[447:384] : a[511:448]
tmp_dst[511:448] := (imm8[7] == 0) ? b[447:384] : b[511:448]
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vshufpd
__m512d _mm512_maskz_shuffle_pd (__mmask8 k, __m512d a, __m512d b, const int imm8)
Synopsis
__m512d _mm512_maskz_shuffle_pd (__mmask8 k, __m512d a, __m512d b, const int imm8)
#include "immintrin.h"
Instruction: vshufpd zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
tmp_dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
tmp_dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
tmp_dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192]
tmp_dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192]
tmp_dst[319:256] := (imm8[4] == 0) ? a[319:256] : a[383:320]
tmp_dst[383:320] := (imm8[5] == 0) ? b[319:256] : b[383:320]
tmp_dst[447:384] := (imm8[6] == 0) ? a[447:384] : a[511:448]
tmp_dst[511:448] := (imm8[7] == 0) ? b[447:384] : b[511:448]
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vshufpd
__m512d _mm512_shuffle_pd (__m512d a, __m512d b, const int imm8)
Synopsis
__m512d _mm512_shuffle_pd (__m512d a, __m512d b, const int imm8)
#include "immintrin.h"
Instruction: vshufpd zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Shuffle double-precision (64-bit) floating-point elements within 128-bit lanes using the control in imm8, and store the results in dst.
Operation
dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64]
dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64]
dst[191:128] := (imm8[2] == 0) ? a[191:128] : a[255:192]
dst[255:192] := (imm8[3] == 0) ? b[191:128] : b[255:192]
dst[319:256] := (imm8[4] == 0) ? a[319:256] : a[383:320]
dst[383:320] := (imm8[5] == 0) ? b[319:256] : b[383:320]
dst[447:384] := (imm8[6] == 0) ? a[447:384] : a[511:448]
dst[511:448] := (imm8[7] == 0) ? b[447:384] : b[511:448]
dst[MAX:512] := 0
pshufw
__m64 _mm_shuffle_pi16 (__m64 a, int imm8)
Synopsis
__m64 _mm_shuffle_pi16 (__m64 a, int imm8)
#include "xmmintrin.h"
Instruction: pshufw mm, mm, imm
CPUID Flags: SSE
Description
Shuffle 16-bit integers in a using the control in imm8, and store the results in dst.
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[15:0] := src[15:0]
1: tmp[15:0] := src[31:16]
2: tmp[15:0] := src[47:32]
3: tmp[15:0] := src[63:48]
ESAC
RETURN tmp[15:0]
}
dst[15:0] := SELECT4(a[63:0], imm8[1:0])
dst[31:16] := SELECT4(a[63:0], imm8[3:2])
dst[47:32] := SELECT4(a[63:0], imm8[5:4])
dst[63:48] := SELECT4(a[63:0], imm8[7:6])
pshufb
__m64 _mm_shuffle_pi8 (__m64 a, __m64 b)
Synopsis
__m64 _mm_shuffle_pi8 (__m64 a, __m64 b)
#include "tmmintrin.h"
Instruction: pshufb mm, mm
CPUID Flags: SSSE3
Description
Shuffle packed 8-bit integers in a according to shuffle control mask in the corresponding 8-bit element of b, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*8
IF b[i+7] == 1
dst[i+7:i] := 0
ELSE
index[2:0] := b[i+2:i]
dst[i+7:i] := a[index*8+7:index*8]
FI
ENDFOR
vshufps
__m128 _mm_mask_shuffle_ps (__m128 src, __mmask8 k, __m128 a, __m128 b, const int imm8)
Synopsis
__m128 _mm_mask_shuffle_ps (__m128 src, __mmask8 k, __m128 a, __m128 b, const int imm8)
#include "immintrin.h"
Instruction: vshufps
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle single-precision (32-bit) floating-point elements in a using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[31:0] := src[31:0]
1: tmp[31:0] := src[63:32]
2: tmp[31:0] := src[95:64]
3: tmp[31:0] := src[127:96]
ESAC
RETURN tmp[31:0]
}
tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6])
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vshufps
__m128 _mm_maskz_shuffle_ps (__mmask8 k, __m128 a, __m128 b, const int imm8)
Synopsis
__m128 _mm_maskz_shuffle_ps (__mmask8 k, __m128 a, __m128 b, const int imm8)
#include "immintrin.h"
Instruction: vshufps
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle single-precision (32-bit) floating-point elements in a using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[31:0] := src[31:0]
1: tmp[31:0] := src[63:32]
2: tmp[31:0] := src[95:64]
3: tmp[31:0] := src[127:96]
ESAC
RETURN tmp[31:0]
}
tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6])
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
shufps
__m128 _mm_shuffle_ps (__m128 a, __m128 b, unsigned int imm8)
Synopsis
__m128 _mm_shuffle_ps (__m128 a, __m128 b, unsigned int imm8)
#include "xmmintrin.h"
Instruction: shufps xmm, xmm, imm
CPUID Flags: SSE
Description
Shuffle single-precision (32-bit) floating-point elements in a using the control in imm8, and store the results in dst.
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[31:0] := src[31:0]
1: tmp[31:0] := src[63:32]
2: tmp[31:0] := src[95:64]
3: tmp[31:0] := src[127:96]
ESAC
RETURN tmp[31:0]
}
dst[31:0] := SELECT4(a[127:0], imm8[1:0])
dst[63:32] := SELECT4(a[127:0], imm8[3:2])
dst[95:64] := SELECT4(b[127:0], imm8[5:4])
dst[127:96] := SELECT4(b[127:0], imm8[7:6])
Performance
vshufps
__m256 _mm256_mask_shuffle_ps (__m256 src, __mmask8 k, __m256 a, __m256 b, const int imm8)
Synopsis
__m256 _mm256_mask_shuffle_ps (__m256 src, __mmask8 k, __m256 a, __m256 b, const int imm8)
#include "immintrin.h"
Instruction: vshufps
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[31:0] := src[31:0]
1: tmp[31:0] := src[63:32]
2: tmp[31:0] := src[95:64]
3: tmp[31:0] := src[127:96]
ESAC
RETURN tmp[31:0]
}
tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6])
tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
tmp_dst[223:192] := SELECT4(b[255:128], imm8[5:4])
tmp_dst[255:224] := SELECT4(b[255:128], imm8[7:6])
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vshufps
__m256 _mm256_maskz_shuffle_ps (__mmask8 k, __m256 a, __m256 b, const int imm8)
Synopsis
__m256 _mm256_maskz_shuffle_ps (__mmask8 k, __m256 a, __m256 b, const int imm8)
#include "immintrin.h"
Instruction: vshufps
CPUID Flags: AVX512VL + AVX512F
Description
Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[31:0] := src[31:0]
1: tmp[31:0] := src[63:32]
2: tmp[31:0] := src[95:64]
3: tmp[31:0] := src[127:96]
ESAC
RETURN tmp[31:0]
}
tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6])
tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
tmp_dst[223:192] := SELECT4(b[255:128], imm8[5:4])
tmp_dst[255:224] := SELECT4(b[255:128], imm8[7:6])
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vshufps
__m256 _mm256_shuffle_ps (__m256 a, __m256 b, const int imm8)
Synopsis
__m256 _mm256_shuffle_ps (__m256 a, __m256 b, const int imm8)
#include "immintrin.h"
Instruction: vshufps ymm, ymm, ymm, imm
CPUID Flags: AVX
Description
Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst.
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[31:0] := src[31:0]
1: tmp[31:0] := src[63:32]
2: tmp[31:0] := src[95:64]
3: tmp[31:0] := src[127:96]
ESAC
RETURN tmp[31:0]
}
dst[31:0] := SELECT4(a[127:0], imm8[1:0])
dst[63:32] := SELECT4(a[127:0], imm8[3:2])
dst[95:64] := SELECT4(b[127:0], imm8[5:4])
dst[127:96] := SELECT4(b[127:0], imm8[7:6])
dst[159:128] := SELECT4(a[255:128], imm8[1:0])
dst[191:160] := SELECT4(a[255:128], imm8[3:2])
dst[223:192] := SELECT4(b[255:128], imm8[5:4])
dst[255:224] := SELECT4(b[255:128], imm8[7:6])
dst[MAX:256] := 0
Performance
vshufps
__m512 _mm512_mask_shuffle_ps (__m512 src, __mmask16 k, __m512 a, __m512 b, const int imm8)
Synopsis
__m512 _mm512_mask_shuffle_ps (__m512 src, __mmask16 k, __m512 a, __m512 b, const int imm8)
#include "immintrin.h"
Instruction: vshufps zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[31:0] := src[31:0]
1: tmp[31:0] := src[63:32]
2: tmp[31:0] := src[95:64]
3: tmp[31:0] := src[127:96]
ESAC
RETURN tmp[31:0]
}
tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6])
tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
tmp_dst[223:192] := SELECT4(b[255:128], imm8[5:4])
tmp_dst[255:224] := SELECT4(b[255:128], imm8[7:6])
tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0])
tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2])
tmp_dst[351:320] := SELECT4(b[383:256], imm8[5:4])
tmp_dst[383:352] := SELECT4(b[383:256], imm8[7:6])
tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0])
tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2])
tmp_dst[479:448] := SELECT4(b[511:384], imm8[5:4])
tmp_dst[511:480] := SELECT4(b[511:384], imm8[7:6])
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vshufps
__m512 _mm512_maskz_shuffle_ps (__mmask16 k, __m512 a, __m512 b, const int imm8)
Synopsis
__m512 _mm512_maskz_shuffle_ps (__mmask16 k, __m512 a, __m512 b, const int imm8)
#include "immintrin.h"
Instruction: vshufps zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[31:0] := src[31:0]
1: tmp[31:0] := src[63:32]
2: tmp[31:0] := src[95:64]
3: tmp[31:0] := src[127:96]
ESAC
RETURN tmp[31:0]
}
tmp_dst[31:0] := SELECT4(a[127:0], imm8[1:0])
tmp_dst[63:32] := SELECT4(a[127:0], imm8[3:2])
tmp_dst[95:64] := SELECT4(b[127:0], imm8[5:4])
tmp_dst[127:96] := SELECT4(b[127:0], imm8[7:6])
tmp_dst[159:128] := SELECT4(a[255:128], imm8[1:0])
tmp_dst[191:160] := SELECT4(a[255:128], imm8[3:2])
tmp_dst[223:192] := SELECT4(b[255:128], imm8[5:4])
tmp_dst[255:224] := SELECT4(b[255:128], imm8[7:6])
tmp_dst[287:256] := SELECT4(a[383:256], imm8[1:0])
tmp_dst[319:288] := SELECT4(a[383:256], imm8[3:2])
tmp_dst[351:320] := SELECT4(b[383:256], imm8[5:4])
tmp_dst[383:352] := SELECT4(b[383:256], imm8[7:6])
tmp_dst[415:384] := SELECT4(a[511:384], imm8[1:0])
tmp_dst[447:416] := SELECT4(a[511:384], imm8[3:2])
tmp_dst[479:448] := SELECT4(b[511:384], imm8[5:4])
tmp_dst[511:480] := SELECT4(b[511:384], imm8[7:6])
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vshufps
__m512 _mm512_shuffle_ps (__m512 a, __m512 b, const int imm8)
Synopsis
__m512 _mm512_shuffle_ps (__m512 a, __m512 b, const int imm8)
#include "immintrin.h"
Instruction: vshufps zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst.
Operation
SELECT4(src, control){
CASE(control[1:0])
0: tmp[31:0] := src[31:0]
1: tmp[31:0] := src[63:32]
2: tmp[31:0] := src[95:64]
3: tmp[31:0] := src[127:96]
ESAC
RETURN tmp[31:0]
}
dst[31:0] := SELECT4(a[127:0], imm8[1:0])
dst[63:32] := SELECT4(a[127:0], imm8[3:2])
dst[95:64] := SELECT4(b[127:0], imm8[5:4])
dst[127:96] := SELECT4(b[127:0], imm8[7:6])
dst[159:128] := SELECT4(a[255:128], imm8[1:0])
dst[191:160] := SELECT4(a[255:128], imm8[3:2])
dst[223:192] := SELECT4(b[255:128], imm8[5:4])
dst[255:224] := SELECT4(b[255:128], imm8[7:6])
dst[287:256] := SELECT4(a[383:256], imm8[1:0])
dst[319:288] := SELECT4(a[383:256], imm8[3:2])
dst[351:320] := SELECT4(b[383:256], imm8[5:4])
dst[383:352] := SELECT4(b[383:256], imm8[7:6])
dst[415:384] := SELECT4(a[511:384], imm8[1:0])
dst[447:416] := SELECT4(a[511:384], imm8[3:2])
dst[479:448] := SELECT4(b[511:384], imm8[5:4])
dst[511:480] := SELECT4(b[511:384], imm8[7:6])
dst[MAX:512] := 0
vpshufhw
__m128i _mm_mask_shufflehi_epi16 (__m128i src, __mmask8 k, __m128i a, int imm8)
Synopsis
__m128i _mm_mask_shufflehi_epi16 (__m128i src, __mmask8 k, __m128i a, int imm8)
#include "immintrin.h"
Instruction: vpshufhw
CPUID Flags: AVX512VL + AVX512BW
Description
Shuffle 16-bit integers in the high 64 bits of a using the control in imm8. Store the results in the high 64 bits of dst, with the low 64 bits being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
tmp_dst[63:0] := a[63:0]
tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64]
tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64]
tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64]
tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64]
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := tmp_dst[i+15:i]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:128] := 0
vpshufhw
__m128i _mm_maskz_shufflehi_epi16 (__mmask8 k, __m128i a, int imm8)
Synopsis
__m128i _mm_maskz_shufflehi_epi16 (__mmask8 k, __m128i a, int imm8)
#include "immintrin.h"
Instruction: vpshufhw
CPUID Flags: AVX512VL + AVX512BW
Description
Shuffle 16-bit integers in the high 64 bits of a using the control in imm8. Store the results in the high 64 bits of dst, with the low 64 bits being copied from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
tmp_dst[63:0] := a[63:0]
tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64]
tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64]
tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64]
tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64]
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := tmp_dst[i+15:i]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
pshufhw
__m128i _mm_shufflehi_epi16 (__m128i a, int imm8)
Synopsis
__m128i _mm_shufflehi_epi16 (__m128i a, int imm8)
#include "emmintrin.h"
Instruction: pshufhw xmm, xmm, imm
CPUID Flags: SSE2
Description
Shuffle 16-bit integers in the high 64 bits of a using the control in imm8. Store the results in the high 64 bits of dst, with the low 64 bits being copied from a to dst.
Operation
dst[63:0] := a[63:0]
dst[79:64] := (a >> (imm8[1:0] * 16))[79:64]
dst[95:80] := (a >> (imm8[3:2] * 16))[79:64]
dst[111:96] := (a >> (imm8[5:4] * 16))[79:64]
dst[127:112] := (a >> (imm8[7:6] * 16))[79:64]
Performance
vpshufhw
__m256i _mm256_mask_shufflehi_epi16 (__m256i src, __mmask16 k, __m256i a, int imm8)
Synopsis
__m256i _mm256_mask_shufflehi_epi16 (__m256i src, __mmask16 k, __m256i a, int imm8)
#include "immintrin.h"
Instruction: vpshufhw
CPUID Flags: AVX512VL + AVX512BW
Description
Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
tmp_dst[63:0] := a[63:0]
tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64]
tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64]
tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64]
tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64]
tmp_dst[191:128] := a[191:128]
tmp_dst[207:192] := (a >> (imm8[1:0] * 16))[207:192]
tmp_dst[223:208] := (a >> (imm8[3:2] * 16))[207:192]
tmp_dst[239:224] := (a >> (imm8[5:4] * 16))[207:192]
tmp_dst[255:240] := (a >> (imm8[7:6] * 16))[207:192]
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := tmp_dst[i+15:i]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
vpshufhw
__m256i _mm256_maskz_shufflehi_epi16 (__mmask16 k, __m256i a, int imm8)
Synopsis
__m256i _mm256_maskz_shufflehi_epi16 (__mmask16 k, __m256i a, int imm8)
#include "immintrin.h"
Instruction: vpshufhw
CPUID Flags: AVX512VL + AVX512BW
Description
Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
tmp_dst[63:0] := a[63:0]
tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64]
tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64]
tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64]
tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64]
tmp_dst[191:128] := a[191:128]
tmp_dst[207:192] := (a >> (imm8[1:0] * 16))[207:192]
tmp_dst[223:208] := (a >> (imm8[3:2] * 16))[207:192]
tmp_dst[239:224] := (a >> (imm8[5:4] * 16))[207:192]
tmp_dst[255:240] := (a >> (imm8[7:6] * 16))[207:192]
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := tmp_dst[i+15:i]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpshufhw
__m256i _mm256_shufflehi_epi16 (__m256i a, const int imm8)
Synopsis
__m256i _mm256_shufflehi_epi16 (__m256i a, const int imm8)
#include "immintrin.h"
Instruction: vpshufhw ymm, ymm, imm
CPUID Flags: AVX2
Description
Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst.
Operation
dst[63:0] := a[63:0]
dst[79:64] := (a >> (imm8[1:0] * 16))[79:64]
dst[95:80] := (a >> (imm8[3:2] * 16))[79:64]
dst[111:96] := (a >> (imm8[5:4] * 16))[79:64]
dst[127:112] := (a >> (imm8[7:6] * 16))[79:64]
dst[191:128] := a[191:128]
dst[207:192] := (a >> (imm8[1:0] * 16))[207:192]
dst[223:208] := (a >> (imm8[3:2] * 16))[207:192]
dst[239:224] := (a >> (imm8[5:4] * 16))[207:192]
dst[255:240] := (a >> (imm8[7:6] * 16))[207:192]
dst[MAX:256] := 0
Performance
vpshufhw
__m512i _mm512_mask_shufflehi_epi16 (__m512i src, __mmask32 k, __m512i a, int imm8)
Synopsis
__m512i _mm512_mask_shufflehi_epi16 (__m512i src, __mmask32 k, __m512i a, int imm8)
#include "immintrin.h"
Instruction: vpshufhw
CPUID Flags: AVX512BW
Description
Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
tmp_dst[63:0] := a[63:0]
tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64]
tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64]
tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64]
tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64]
tmp_dst[191:128] := a[191:128]
tmp_dst[207:192] := (a >> (imm8[1:0] * 16))[207:192]
tmp_dst[223:208] := (a >> (imm8[3:2] * 16))[207:192]
tmp_dst[239:224] := (a >> (imm8[5:4] * 16))[207:192]
tmp_dst[255:240] := (a >> (imm8[7:6] * 16))[207:192]
tmp_dst[319:256] := a[319:256]
tmp_dst[335:320] := (a >> (imm8[1:0] * 16))[335:320]
tmp_dst[351:336] := (a >> (imm8[3:2] * 16))[335:320]
tmp_dst[367:352] := (a >> (imm8[5:4] * 16))[335:320]
tmp_dst[383:368] := (a >> (imm8[7:6] * 16))[335:320]
tmp_dst[447:384] := a[447:384]
tmp_dst[463:448] := (a >> (imm8[1:0] * 16))[463:448]
tmp_dst[479:464] := (a >> (imm8[3:2] * 16))[463:448]
tmp_dst[495:480] := (a >> (imm8[5:4] * 16))[463:448]
tmp_dst[511:496] := (a >> (imm8[7:6] * 16))[463:448]
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := tmp_dst[i+15:i]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:512] := 0
vpshufhw
__m512i _mm512_maskz_shufflehi_epi16 (__mmask32 k, __m512i a, int imm8)
Synopsis
__m512i _mm512_maskz_shufflehi_epi16 (__mmask32 k, __m512i a, int imm8)
#include "immintrin.h"
Instruction: vpshufhw
CPUID Flags: AVX512BW
Description
Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
tmp_dst[63:0] := a[63:0]
tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64]
tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64]
tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64]
tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64]
tmp_dst[191:128] := a[191:128]
tmp_dst[207:192] := (a >> (imm8[1:0] * 16))[207:192]
tmp_dst[223:208] := (a >> (imm8[3:2] * 16))[207:192]
tmp_dst[239:224] := (a >> (imm8[5:4] * 16))[207:192]
tmp_dst[255:240] := (a >> (imm8[7:6] * 16))[207:192]
tmp_dst[319:256] := a[319:256]
tmp_dst[335:320] := (a >> (imm8[1:0] * 16))[335:320]
tmp_dst[351:336] := (a >> (imm8[3:2] * 16))[335:320]
tmp_dst[367:352] := (a >> (imm8[5:4] * 16))[335:320]
tmp_dst[383:368] := (a >> (imm8[7:6] * 16))[335:320]
tmp_dst[447:384] := a[447:384]
tmp_dst[463:448] := (a >> (imm8[1:0] * 16))[463:448]
tmp_dst[479:464] := (a >> (imm8[3:2] * 16))[463:448]
tmp_dst[495:480] := (a >> (imm8[5:4] * 16))[463:448]
tmp_dst[511:496] := (a >> (imm8[7:6] * 16))[463:448]
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := tmp_dst[i+15:i]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpshufhw
__m512i _mm512_shufflehi_epi16 (__m512i a, int imm8)
Synopsis
__m512i _mm512_shufflehi_epi16 (__m512i a, int imm8)
#include "immintrin.h"
Instruction: vpshufhw
CPUID Flags: AVX512BW
Description
Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the high 64 bits of 128-bit lanes of dst, with the low 64 bits of 128-bit lanes being copied from a to dst.
Operation
dst[63:0] := a[63:0]
dst[79:64] := (a >> (imm8[1:0] * 16))[79:64]
dst[95:80] := (a >> (imm8[3:2] * 16))[79:64]
dst[111:96] := (a >> (imm8[5:4] * 16))[79:64]
dst[127:112] := (a >> (imm8[7:6] * 16))[79:64]
dst[191:128] := a[191:128]
dst[207:192] := (a >> (imm8[1:0] * 16))[207:192]
dst[223:208] := (a >> (imm8[3:2] * 16))[207:192]
dst[239:224] := (a >> (imm8[5:4] * 16))[207:192]
dst[255:240] := (a >> (imm8[7:6] * 16))[207:192]
dst[319:256] := a[319:256]
dst[335:320] := (a >> (imm8[1:0] * 16))[335:320]
dst[351:336] := (a >> (imm8[3:2] * 16))[335:320]
dst[367:352] := (a >> (imm8[5:4] * 16))[335:320]
dst[383:368] := (a >> (imm8[7:6] * 16))[335:320]
dst[447:384] := a[447:384]
dst[463:448] := (a >> (imm8[1:0] * 16))[463:448]
dst[479:464] := (a >> (imm8[3:2] * 16))[463:448]
dst[495:480] := (a >> (imm8[5:4] * 16))[463:448]
dst[511:496] := (a >> (imm8[7:6] * 16))[463:448]
dst[MAX:512] := 0
vpshuflw
__m128i _mm_mask_shufflelo_epi16 (__m128i src, __mmask8 k, __m128i a, int imm8)
Synopsis
__m128i _mm_mask_shufflelo_epi16 (__m128i src, __mmask8 k, __m128i a, int imm8)
#include "immintrin.h"
Instruction: vpshuflw
CPUID Flags: AVX512VL + AVX512BW
Description
Shuffle 16-bit integers in the low 64 bits of a using the control in imm8. Store the results in the low 64 bits of dst, with the high 64 bits being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0]
tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0]
tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0]
tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0]
tmp_dst[127:64] := a[127:64]
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := tmp_dst[i+15:i]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:128] := 0
vpshuflw
__m128i _mm_maskz_shufflelo_epi16 (__mmask8 k, __m128i a, int imm8)
Synopsis
__m128i _mm_maskz_shufflelo_epi16 (__mmask8 k, __m128i a, int imm8)
#include "immintrin.h"
Instruction: vpshuflw
CPUID Flags: AVX512VL + AVX512BW
Description
Shuffle 16-bit integers in the low 64 bits of a using the control in imm8. Store the results in the low 64 bits of dst, with the high 64 bits being copied from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0]
tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0]
tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0]
tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0]
tmp_dst[127:64] := a[127:64]
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := tmp_dst[i+15:i]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
pshuflw
__m128i _mm_shufflelo_epi16 (__m128i a, int imm8)
Synopsis
__m128i _mm_shufflelo_epi16 (__m128i a, int imm8)
#include "emmintrin.h"
Instruction: pshuflw xmm, xmm, imm
CPUID Flags: SSE2
Description
Shuffle 16-bit integers in the low 64 bits of a using the control in imm8. Store the results in the low 64 bits of dst, with the high 64 bits being copied from a to dst.
Operation
dst[15:0] := (a >> (imm8[1:0] * 16))[15:0]
dst[31:16] := (a >> (imm8[3:2] * 16))[15:0]
dst[47:32] := (a >> (imm8[5:4] * 16))[15:0]
dst[63:48] := (a >> (imm8[7:6] * 16))[15:0]
dst[127:64] := a[127:64]
Performance
vpshuflw
__m256i _mm256_mask_shufflelo_epi16 (__m256i src, __mmask16 k, __m256i a, int imm8)
Synopsis
__m256i _mm256_mask_shufflelo_epi16 (__m256i src, __mmask16 k, __m256i a, int imm8)
#include "immintrin.h"
Instruction: vpshuflw
CPUID Flags: AVX512VL + AVX512BW
Description
Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0]
tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0]
tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0]
tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0]
tmp_dst[127:64] := a[127:64]
tmp_dst[143:128] := (a >> (imm8[1:0] * 16))[143:128]
tmp_dst[159:144] := (a >> (imm8[3:2] * 16))[143:128]
tmp_dst[175:160] := (a >> (imm8[5:4] * 16))[143:128]
tmp_dst[191:176] := (a >> (imm8[7:6] * 16))[143:128]
tmp_dst[255:192] := a[255:192]
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := tmp_dst[i+15:i]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
vpshuflw
__m256i _mm256_maskz_shufflelo_epi16 (__mmask16 k, __m256i a, int imm8)
Synopsis
__m256i _mm256_maskz_shufflelo_epi16 (__mmask16 k, __m256i a, int imm8)
#include "immintrin.h"
Instruction: vpshuflw
CPUID Flags: AVX512VL + AVX512BW
Description
Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0]
tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0]
tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0]
tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0]
tmp_dst[127:64] := a[127:64]
tmp_dst[143:128] := (a >> (imm8[1:0] * 16))[143:128]
tmp_dst[159:144] := (a >> (imm8[3:2] * 16))[143:128]
tmp_dst[175:160] := (a >> (imm8[5:4] * 16))[143:128]
tmp_dst[191:176] := (a >> (imm8[7:6] * 16))[143:128]
tmp_dst[255:192] := a[255:192]
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := tmp_dst[i+15:i]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpshuflw
__m256i _mm256_shufflelo_epi16 (__m256i a, const int imm8)
Synopsis
__m256i _mm256_shufflelo_epi16 (__m256i a, const int imm8)
#include "immintrin.h"
Instruction: vpshuflw ymm, ymm, imm
CPUID Flags: AVX2
Description
Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst.
Operation
dst[15:0] := (a >> (imm8[1:0] * 16))[15:0]
dst[31:16] := (a >> (imm8[3:2] * 16))[15:0]
dst[47:32] := (a >> (imm8[5:4] * 16))[15:0]
dst[63:48] := (a >> (imm8[7:6] * 16))[15:0]
dst[127:64] := a[127:64]
dst[143:128] := (a >> (imm8[1:0] * 16))[143:128]
dst[159:144] := (a >> (imm8[3:2] * 16))[143:128]
dst[175:160] := (a >> (imm8[5:4] * 16))[143:128]
dst[191:176] := (a >> (imm8[7:6] * 16))[143:128]
dst[255:192] := a[255:192]
dst[MAX:256] := 0
Performance
vpshuflw
__m512i _mm512_mask_shufflelo_epi16 (__m512i src, __mmask32 k, __m512i a, int imm8)
Synopsis
__m512i _mm512_mask_shufflelo_epi16 (__m512i src, __mmask32 k, __m512i a, int imm8)
#include "immintrin.h"
Instruction: vpshuflw
CPUID Flags: AVX512BW
Description
Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst, using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0]
tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0]
tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0]
tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0]
tmp_dst[127:64] := a[127:64]
tmp_dst[143:128] := (a >> (imm8[1:0] * 16))[143:128]
tmp_dst[159:144] := (a >> (imm8[3:2] * 16))[143:128]
tmp_dst[175:160] := (a >> (imm8[5:4] * 16))[143:128]
tmp_dst[191:176] := (a >> (imm8[7:6] * 16))[143:128]
tmp_dst[255:192] := a[255:192]
tmp_dst[271:256] := (a >> (imm8[1:0] * 16))[271:256]
tmp_dst[287:272] := (a >> (imm8[3:2] * 16))[271:256]
tmp_dst[303:288] := (a >> (imm8[5:4] * 16))[271:256]
tmp_dst[319:304] := (a >> (imm8[7:6] * 16))[271:256]
tmp_dst[383:320] := a[383:320]
tmp_dst[399:384] := (a >> (imm8[1:0] * 16))[399:384]
tmp_dst[415:400] := (a >> (imm8[3:2] * 16))[399:384]
tmp_dst[431:416] := (a >> (imm8[5:4] * 16))[399:384]
tmp_dst[447:432] := (a >> (imm8[7:6] * 16))[399:384]
tmp_dst[511:448] := a[511:448]
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := tmp_dst[i+15:i]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:512] := 0
vpshuflw
__m512i _mm512_maskz_shufflelo_epi16 (__mmask32 k, __m512i a, int imm8)
Synopsis
__m512i _mm512_maskz_shufflelo_epi16 (__mmask32 k, __m512i a, int imm8)
#include "immintrin.h"
Instruction: vpshuflw
CPUID Flags: AVX512BW
Description
Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst, using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0]
tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0]
tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0]
tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0]
tmp_dst[127:64] := a[127:64]
tmp_dst[143:128] := (a >> (imm8[1:0] * 16))[143:128]
tmp_dst[159:144] := (a >> (imm8[3:2] * 16))[143:128]
tmp_dst[175:160] := (a >> (imm8[5:4] * 16))[143:128]
tmp_dst[191:176] := (a >> (imm8[7:6] * 16))[143:128]
tmp_dst[255:192] := a[255:192]
tmp_dst[271:256] := (a >> (imm8[1:0] * 16))[271:256]
tmp_dst[287:272] := (a >> (imm8[3:2] * 16))[271:256]
tmp_dst[303:288] := (a >> (imm8[5:4] * 16))[271:256]
tmp_dst[319:304] := (a >> (imm8[7:6] * 16))[271:256]
tmp_dst[383:320] := a[383:320]
tmp_dst[399:384] := (a >> (imm8[1:0] * 16))[399:384]
tmp_dst[415:400] := (a >> (imm8[3:2] * 16))[399:384]
tmp_dst[431:416] := (a >> (imm8[5:4] * 16))[399:384]
tmp_dst[447:432] := (a >> (imm8[7:6] * 16))[399:384]
tmp_dst[511:448] := a[511:448]
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := tmp_dst[i+15:i]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpshuflw
__m512i _mm512_shufflelo_epi16 (__m512i a, int imm8)
Synopsis
__m512i _mm512_shufflelo_epi16 (__m512i a, int imm8)
#include "immintrin.h"
Instruction: vpshuflw
CPUID Flags: AVX512BW
Description
Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of a using the control in imm8. Store the results in the low 64 bits of 128-bit lanes of dst, with the high 64 bits of 128-bit lanes being copied from a to dst.
Operation
dst[15:0] := (a >> (imm8[1:0] * 16))[15:0]
dst[31:16] := (a >> (imm8[3:2] * 16))[15:0]
dst[47:32] := (a >> (imm8[5:4] * 16))[15:0]
dst[63:48] := (a >> (imm8[7:6] * 16))[15:0]
dst[127:64] := a[127:64]
dst[143:128] := (a >> (imm8[1:0] * 16))[143:128]
dst[159:144] := (a >> (imm8[3:2] * 16))[143:128]
dst[175:160] := (a >> (imm8[5:4] * 16))[143:128]
dst[191:176] := (a >> (imm8[7:6] * 16))[143:128]
dst[255:192] := a[255:192]
dst[271:256] := (a >> (imm8[1:0] * 16))[271:256]
dst[287:272] := (a >> (imm8[3:2] * 16))[271:256]
dst[303:288] := (a >> (imm8[5:4] * 16))[271:256]
dst[319:304] := (a >> (imm8[7:6] * 16))[271:256]
dst[383:320] := a[383:320]
dst[399:384] := (a >> (imm8[1:0] * 16))[399:384]
dst[415:400] := (a >> (imm8[3:2] * 16))[399:384]
dst[431:416] := (a >> (imm8[5:4] * 16))[399:384]
dst[447:432] := (a >> (imm8[7:6] * 16))[399:384]
dst[511:448] := a[511:448]
dst[MAX:512] := 0
psignw
__m128i _mm_sign_epi16 (__m128i a, __m128i b)
Synopsis
__m128i _mm_sign_epi16 (__m128i a, __m128i b)
#include "tmmintrin.h"
Instruction: psignw xmm, xmm
CPUID Flags: SSSE3
Description
Negate packed 16-bit integers in a when the corresponding signed 16-bit integer in b is negative, and store the results in dst. Elements in dst are zeroed out when the corresponding element in b is zero.
Operation
FOR j := 0 to 7
i := j*16
IF b[i+15:i] < 0
dst[i+15:i] := NEG(a[i+15:i])
ELSE IF b[i+15:i] = 0
dst[i+15:i] := 0
ELSE
dst[i+15:i] := a[i+15:i]
FI
ENDFOR
Performance
vpsignw
__m256i _mm256_sign_epi16 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_sign_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsignw ymm, ymm, ymm
CPUID Flags: AVX2
Description
Negate packed 16-bit integers in a when the corresponding signed 16-bit integer in b is negative, and store the results in dst. Elements in dst are zeroed out when the corresponding element in b is zero.
Operation
FOR j := 0 to 15
i := j*16
IF b[i+15:i] < 0
dst[i+15:i] := NEG(a[i+15:i])
ELSE IF b[i+15:i] = 0
dst[i+15:i] := 0
ELSE
dst[i+15:i] := a[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
Performance
psignd
__m128i _mm_sign_epi32 (__m128i a, __m128i b)
Synopsis
__m128i _mm_sign_epi32 (__m128i a, __m128i b)
#include "tmmintrin.h"
Instruction: psignd xmm, xmm
CPUID Flags: SSSE3
Description
Negate packed 32-bit integers in a when the corresponding signed 32-bit integer in b is negative, and store the results in dst. Elements in dst are zeroed out when the corresponding element in b is zero.
Operation
FOR j := 0 to 3
i := j*32
IF b[i+31:i] < 0
dst[i+31:i] := NEG(a[i+31:i])
ELSE IF b[i+31:i] = 0
dst[i+31:i] := 0
ELSE
dst[i+31:i] := a[i+31:i]
FI
ENDFOR
Performance
vpsignd
__m256i _mm256_sign_epi32 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_sign_epi32 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsignd ymm, ymm, ymm
CPUID Flags: AVX2
Description
Negate packed 32-bit integers in a when the corresponding signed 32-bit integer in b is negative, and store the results in dst. Elements in dst are zeroed out when the corresponding element in b is zero.
Operation
FOR j := 0 to 7
i := j*32
IF b[i+31:i] < 0
dst[i+31:i] := NEG(a[i+31:i])
ELSE IF b[i+31:i] = 0
dst[i+31:i] := 0
ELSE
dst[i+31:i] := a[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
Performance
psignb
__m128i _mm_sign_epi8 (__m128i a, __m128i b)
Synopsis
__m128i _mm_sign_epi8 (__m128i a, __m128i b)
#include "tmmintrin.h"
Instruction: psignb xmm, xmm
CPUID Flags: SSSE3
Description
Negate packed 8-bit integers in a when the corresponding signed 8-bit integer in b is negative, and store the results in dst. Elements in dst are zeroed out when the corresponding element in b is zero.
Operation
FOR j := 0 to 15
i := j*8
IF b[i+7:i] < 0
dst[i+7:i] := NEG(a[i+7:i])
ELSE IF b[i+7:i] = 0
dst[i+7:i] := 0
ELSE
dst[i+7:i] := a[i+7:i]
FI
ENDFOR
Performance
vpsignb
__m256i _mm256_sign_epi8 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_sign_epi8 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsignb ymm, ymm, ymm
CPUID Flags: AVX2
Description
Negate packed 8-bit integers in a when the corresponding signed 8-bit integer in b is negative, and store the results in dst. Elements in dst are zeroed out when the corresponding element in b is zero.
Operation
FOR j := 0 to 31
i := j*8
IF b[i+7:i] < 0
dst[i+7:i] := NEG(a[i+7:i])
ELSE IF b[i+7:i] = 0
dst[i+7:i] := 0
ELSE
dst[i+7:i] := a[i+7:i]
FI
ENDFOR
dst[MAX:256] := 0
Performance
psignw
__m64 _mm_sign_pi16 (__m64 a, __m64 b)
Synopsis
__m64 _mm_sign_pi16 (__m64 a, __m64 b)
#include "tmmintrin.h"
Instruction: psignw mm, mm
CPUID Flags: SSSE3
Description
Negate packed 16-bit integers in a when the corresponding signed 16-bit integer in b is negative, and store the results in dst. Elements in dst are zeroed out when the corresponding element in b is zero.
Operation
FOR j := 0 to 3
i := j*16
IF b[i+15:i] < 0
dst[i+15:i] := NEG(a[i+15:i])
ELSE IF b[i+15:i] = 0
dst[i+15:i] := 0
ELSE
dst[i+15:i] := a[i+15:i]
FI
ENDFOR
psignd
__m64 _mm_sign_pi32 (__m64 a, __m64 b)
Synopsis
__m64 _mm_sign_pi32 (__m64 a, __m64 b)
#include "tmmintrin.h"
Instruction: psignd mm, mm
CPUID Flags: SSSE3
Description
Negate packed 32-bit integers in a when the corresponding signed 32-bit integer in b is negative, and store the results in dst. Elements in dst are zeroed out when the corresponding element in b is zero.
Operation
FOR j := 0 to 1
i := j*32
IF b[i+31:i] < 0
dst[i+31:i] := NEG(a[i+31:i])
ELSE IF b[i+31:i] = 0
dst[i+31:i] := 0
ELSE
dst[i+31:i] := a[i+31:i]
FI
ENDFOR
psignb
__m64 _mm_sign_pi8 (__m64 a, __m64 b)
Synopsis
__m64 _mm_sign_pi8 (__m64 a, __m64 b)
#include "tmmintrin.h"
Instruction: psignb mm, mm
CPUID Flags: SSSE3
Description
Negate packed 8-bit integers in a when the corresponding signed 8-bit integer in b is negative, and store the results in dst. Elements in dst are zeroed out when the corresponding element in b is zero.
Operation
FOR j := 0 to 7
i := j*8
IF b[i+7:i] < 0
dst[i+7:i] := NEG(a[i+7:i])
ELSE IF b[i+7:i] = 0
dst[i+7:i] := 0
ELSE
dst[i+7:i] := a[i+7:i]
FI
ENDFOR
...
__m128d _mm_sin_pd (__m128d a)
Synopsis
__m128d _mm_sin_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the sine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := SIN(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
...
__m256d _mm256_sin_pd (__m256d a)
Synopsis
__m256d _mm256_sin_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the sine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := SIN(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
...
__m512d _mm512_mask_sin_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_sin_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the sine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := SIN(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m512d _mm512_sin_pd (__m512d a)
Synopsis
__m512d _mm512_sin_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the sine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := SIN(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
...
__m128 _mm_sin_ps (__m128 a)
Synopsis
__m128 _mm_sin_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the sine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := SIN(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256 _mm256_sin_ps (__m256 a)
Synopsis
__m256 _mm256_sin_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the sine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := SIN(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
...
__m512 _mm512_mask_sin_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_sin_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the sine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := SIN(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m512 _mm512_sin_ps (__m512 a)
Synopsis
__m512 _mm512_sin_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the sine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := SIN(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
...
__m128d _mm_sincos_pd (__m128d * mem_addr, __m128d a)
Synopsis
__m128d _mm_sincos_pd (__m128d * mem_addr, __m128d a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the sine and cosine of packed double-precision (64-bit) floating-point elements in a expressed in radians, store the sine in dst, and store the cosine into memory at mem_addr.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := SIN(a[i+63:i])
MEM[mem_addr+i+63:mem_addr+i] := COS(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
...
__m256d _mm256_sincos_pd (__m256d * mem_addr, __m256d a)
Synopsis
__m256d _mm256_sincos_pd (__m256d * mem_addr, __m256d a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the sine and cosine of packed double-precision (64-bit) floating-point elements in a expressed in radians, store the sine in dst, and store the cosine into memory at mem_addr.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := SIN(a[i+63:i])
MEM[mem_addr+i+63:mem_addr+i] := COS(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
...
__m512d _mm512_mask_sincos_pd (__m512d * cos_res, __m512d sin_src, __m512d cos_src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_sincos_pd (__m512d * cos_res, __m512d sin_src, __m512d cos_src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Computes the sine and cosine of the packed double-precision (64-bit) floating-point elements in a and stores the results of the sine computation in dst and the results of the cosine computation in cos_res. Elements are written to their respective locations using writemask k (elements are copied from sin_src or cos_src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := SIN(a[i+63:i])
cos_res[i+63:i] := COS(a[i+63:i])
ELSE
dst[i+63:i] := sin_src[i+63:i]
cos_res[i+63:i] := cos_src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
cos_res[MAX:512] := 0
...
__m512d _mm512_sincos_pd (__m512d * cos_res, __m512d a)
Synopsis
__m512d _mm512_sincos_pd (__m512d * cos_res, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Computes the sine and cosine of the packed double-precision (64-bit) floating-point elements in a and stores the results of the sine computation in dst and the results of the cosine computation in cos_res.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := SIN(a[i+63:i])
cos_res[i+63:i] := COS(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
cos_res[MAX:512] := 0
...
__m128 _mm_sincos_ps (__m128 * mem_addr, __m128 a)
Synopsis
__m128 _mm_sincos_ps (__m128 * mem_addr, __m128 a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the sine and cosine of packed single-precision (32-bit) floating-point elements in a expressed in radians, store the sine in dst, and store the cosine into memory at mem_addr.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := SIN(a[i+31:i])
MEM[mem_addr+i+31:mem_addr+i] := COS(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256 _mm256_sincos_ps (__m256 * mem_addr, __m256 a)
Synopsis
__m256 _mm256_sincos_ps (__m256 * mem_addr, __m256 a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the sine and cosine of packed single-precision (32-bit) floating-point elements in a expressed in radians, store the sine in dst, and store the cosine into memory at mem_addr.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := SIN(a[i+31:i])
MEM[mem_addr+i+31:mem_addr+i] := COS(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
...
__m512 _mm512_mask_sincos_ps (__m512 * cos_res, __m512 sin_src, __m512 cos_src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_sincos_ps (__m512 * cos_res, __m512 sin_src, __m512 cos_src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Computes the sine and cosine of the packed single-precision (32-bit) floating-point elements in a and stores the results of the sine computation in dst and the results of the cosine computation in cos_res. Elements are written to their respective locations using writemask k (elements are copied from sin_src or cos_src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := SIN(a[i+31:i])
cos_res[i+31:i] := COS(a[i+31:i])
ELSE
dst[i+31:i] := sin_src[i+31:i]
cos_res[i+31:i] := cos_src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
cos_res[MAX:512] := 0
...
__m512 _mm512_sincos_ps (__m512 * cos_res, __m512 a)
Synopsis
__m512 _mm512_sincos_ps (__m512 * cos_res, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Computes the sine and cosine of the packed single-precision (32-bit) floating-point elements in a and stores the results of the sine computation in dst and the results of the cosine computation in cos_res.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := SIN(a[i+31:i])
cos_res[i+31:i] := COS(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
cos_res[MAX:512] := 0
...
__m128d _mm_sind_pd (__m128d a)
Synopsis
__m128d _mm_sind_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the sine of packed double-precision (64-bit) floating-point elements in a expressed in degrees, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := SIND(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
...
__m256d _mm256_sind_pd (__m256d a)
Synopsis
__m256d _mm256_sind_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the sine of packed double-precision (64-bit) floating-point elements in a expressed in degrees, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := SIND(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
...
__m512d _mm512_mask_sind_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_sind_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the sine of packed double-precision (64-bit) floating-point elements in a expressed in degrees, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := SIND(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m512d _mm512_sind_pd (__m512d a)
Synopsis
__m512d _mm512_sind_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the sine of packed double-precision (64-bit) floating-point elements in a expressed in degrees, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := SIND(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
...
__m128 _mm_sind_ps (__m128 a)
Synopsis
__m128 _mm_sind_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the sine of packed single-precision (32-bit) floating-point elements in a expressed in degrees, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := SIND(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256 _mm256_sind_ps (__m256 a)
Synopsis
__m256 _mm256_sind_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the sine of packed single-precision (32-bit) floating-point elements in a expressed in degrees, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := SIND(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
...
__m512 _mm512_mask_sind_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_sind_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the sine of packed single-precision (32-bit) floating-point elements in a expressed in degrees, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := SIND(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m512 _mm512_sind_ps (__m512 a)
Synopsis
__m512 _mm512_sind_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the sine of packed single-precision (32-bit) floating-point elements in a expressed in degrees, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := SIND(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
...
__m128d _mm_sinh_pd (__m128d a)
Synopsis
__m128d _mm_sinh_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the hyperbolic sine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := SINH(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
...
__m256d _mm256_sinh_pd (__m256d a)
Synopsis
__m256d _mm256_sinh_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the hyperbolic sine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := SINH(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
...
__m512d _mm512_mask_sinh_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_sinh_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the hyperbolic sine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := SINH(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m512d _mm512_sinh_pd (__m512d a)
Synopsis
__m512d _mm512_sinh_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the hyperbolic sine of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := SINH(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
...
__m128 _mm_sinh_ps (__m128 a)
Synopsis
__m128 _mm_sinh_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the hyperbolic sine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := SINH(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256 _mm256_sinh_ps (__m256 a)
Synopsis
__m256 _mm256_sinh_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the hyperbolic sine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := SINH(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
...
__m512 _mm512_mask_sinh_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_sinh_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the hyperbolic sine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := SINH(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m512 _mm512_sinh_ps (__m512 a)
Synopsis
__m512 _mm512_sinh_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the hyperbolic sine of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := SINH(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
vpsllw
__m128i _mm_mask_sll_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i count)
Synopsis
__m128i _mm_mask_sll_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsllw
CPUID Flags: AVX512VL + AVX512BW
Description
Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
IF count[63:0] > 15
dst[i+15:i] := 0
ELSE
dst[i+15:i] := ZeroExtend(a[i+15:i] << count[63:0])
FI
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:128] := 0
vpsllw
__m128i _mm_maskz_sll_epi16 (__mmask8 k, __m128i a, __m128i count)
Synopsis
__m128i _mm_maskz_sll_epi16 (__mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsllw
CPUID Flags: AVX512VL + AVX512BW
Description
Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
IF count[63:0] > 15
dst[i+15:i] := 0
ELSE
dst[i+15:i] := ZeroExtend(a[i+15:i] << count[63:0])
FI
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
psllw
__m128i _mm_sll_epi16 (__m128i a, __m128i count)
Synopsis
__m128i _mm_sll_epi16 (__m128i a, __m128i count)
#include "emmintrin.h"
Instruction: psllw xmm, xmm
CPUID Flags: SSE2
Description
Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*16
IF count[63:0] > 15
dst[i+15:i] := 0
ELSE
dst[i+15:i] := ZeroExtend(a[i+15:i] << count[63:0])
FI
ENDFOR
Performance
vpsllw
__m256i _mm256_mask_sll_epi16 (__m256i src, __mmask16 k, __m256i a, __m128i count)
Synopsis
__m256i _mm256_mask_sll_epi16 (__m256i src, __mmask16 k, __m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpsllw
CPUID Flags: AVX512VL + AVX512BW
Description
Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
IF count[63:0] > 15
dst[i+15:i] := 0
ELSE
dst[i+15:i] := ZeroExtend(a[i+15:i] << count[63:0])
FI
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
vpsllw
__m256i _mm256_maskz_sll_epi16 (__mmask16 k, __m256i a, __m128i count)
Synopsis
__m256i _mm256_maskz_sll_epi16 (__mmask16 k, __m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpsllw
CPUID Flags: AVX512VL + AVX512BW
Description
Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
IF count[63:0] > 15
dst[i+15:i] := 0
ELSE
dst[i+15:i] := ZeroExtend(a[i+15:i] << count[63:0])
FI
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpsllw
__m256i _mm256_sll_epi16 (__m256i a, __m128i count)
Synopsis
__m256i _mm256_sll_epi16 (__m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpsllw ymm, ymm, xmm
CPUID Flags: AVX2
Description
Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*16
IF count[63:0] > 15
dst[i+15:i] := 0
ELSE
dst[i+15:i] := ZeroExtend(a[i+15:i] << count[63:0])
FI
ENDFOR
dst[MAX:256] := 0
Performance
vpsllw
__m512i _mm512_mask_sll_epi16 (__m512i src, __mmask32 k, __m512i a, __m128i count)
Synopsis
__m512i _mm512_mask_sll_epi16 (__m512i src, __mmask32 k, __m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpsllw
CPUID Flags: AVX512BW
Description
Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
IF count[63:0] > 15
dst[i+15:i] := 0
ELSE
dst[i+15:i] := ZeroExtend(a[i+15:i] << count[63:0])
FI
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:512] := 0
vpsllw
__m512i _mm512_maskz_sll_epi16 (__mmask32 k, __m512i a, __m128i count)
Synopsis
__m512i _mm512_maskz_sll_epi16 (__mmask32 k, __m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpsllw
CPUID Flags: AVX512BW
Description
Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
IF count[63:0] > 15
dst[i+15:i] := 0
ELSE
dst[i+15:i] := ZeroExtend(a[i+15:i] << count[63:0])
FI
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpsllw
__m512i _mm512_sll_epi16 (__m512i a, __m128i count)
Synopsis
__m512i _mm512_sll_epi16 (__m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpsllw
CPUID Flags: AVX512BW
Description
Shift packed 16-bit integers in a left by count while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 31
i := j*16
IF count[63:0] > 15
dst[i+15:i] := 0
ELSE
dst[i+15:i] := ZeroExtend(a[i+15:i] << count[63:0])
FI
ENDFOR
dst[MAX:512] := 0
vpslld
__m128i _mm_mask_sll_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i count)
Synopsis
__m128i _mm_mask_sll_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpslld
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
IF count[63:0] > 31
dst[i+31:i] := 0
ELSE
dst[i+31:i] := ZeroExtend(a[i+31:i] << count[63:0])
FI
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vpslld
__m128i _mm_maskz_sll_epi32 (__mmask8 k, __m128i a, __m128i count)
Synopsis
__m128i _mm_maskz_sll_epi32 (__mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpslld
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
IF count[63:0] > 31
dst[i+31:i] := 0
ELSE
dst[i+31:i] := ZeroExtend(a[i+31:i] << count[63:0])
FI
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
pslld
__m128i _mm_sll_epi32 (__m128i a, __m128i count)
Synopsis
__m128i _mm_sll_epi32 (__m128i a, __m128i count)
#include "emmintrin.h"
Instruction: pslld xmm, xmm
CPUID Flags: SSE2
Description
Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
IF count[63:0] > 31
dst[i+31:i] := 0
ELSE
dst[i+31:i] := ZeroExtend(a[i+31:i] << count[63:0])
FI
ENDFOR
Performance
vpslld
__m256i _mm256_mask_sll_epi32 (__m256i src, __mmask8 k, __m256i a, __m128i count)
Synopsis
__m256i _mm256_mask_sll_epi32 (__m256i src, __mmask8 k, __m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpslld
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
IF count[63:0] > 31
dst[i+31:i] := 0
ELSE
dst[i+31:i] := ZeroExtend(a[i+31:i] << count[63:0])
FI
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vpslld
__m256i _mm256_maskz_sll_epi32 (__mmask8 k, __m256i a, __m128i count)
Synopsis
__m256i _mm256_maskz_sll_epi32 (__mmask8 k, __m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpslld
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
IF count[63:0] > 31
dst[i+31:i] := 0
ELSE
dst[i+31:i] := ZeroExtend(a[i+31:i] << count[63:0])
FI
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpslld
__m256i _mm256_sll_epi32 (__m256i a, __m128i count)
Synopsis
__m256i _mm256_sll_epi32 (__m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpslld ymm, ymm, xmm
CPUID Flags: AVX2
Description
Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
IF count[63:0] > 31
dst[i+31:i] := 0
ELSE
dst[i+31:i] := ZeroExtend(a[i+31:i] << count[63:0])
FI
ENDFOR
dst[MAX:256] := 0
Performance
vpslld
__m512i _mm512_mask_sll_epi32 (__m512i src, __mmask16 k, __m512i a, __m128i count)
Synopsis
__m512i _mm512_mask_sll_epi32 (__m512i src, __mmask16 k, __m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpslld zmm {k}, zmm, xmm
CPUID Flags: AVX512F
Description
Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
IF count[63:0] > 31
dst[i+31:i] := 0
ELSE
dst[i+31:i] := ZeroExtend(a[i+31:i] << count[63:0])
FI
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpslld
__m512i _mm512_maskz_sll_epi32 (__mmask16 k, __m512i a, __m128i count)
Synopsis
__m512i _mm512_maskz_sll_epi32 (__mmask16 k, __m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpslld zmm {k}, zmm, xmm
CPUID Flags: AVX512F
Description
Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
IF count[63:0] > 31
dst[i+31:i] := 0
ELSE
dst[i+31:i] := ZeroExtend(a[i+31:i] << count[63:0])
FI
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpslld
__m512i _mm512_sll_epi32 (__m512i a, __m128i count)
Synopsis
__m512i _mm512_sll_epi32 (__m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpslld zmm {k}, zmm, xmm
CPUID Flags: AVX512F
Description
Shift packed 32-bit integers in a left by count while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
IF count[63:0] > 31
dst[i+31:i] := 0
ELSE
dst[i+31:i] := ZeroExtend(a[i+31:i] << count[63:0])
FI
ENDFOR
dst[MAX:512] := 0
vpsllq
__m128i _mm_mask_sll_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i count)
Synopsis
__m128i _mm_mask_sll_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsllq
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
IF count[63:0] > 63
dst[i+63:i] := 0
ELSE
dst[i+63:i] := ZeroExtend(a[i+63:i] << count[63:0])
FI
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpsllq
__m128i _mm_maskz_sll_epi64 (__mmask8 k, __m128i a, __m128i count)
Synopsis
__m128i _mm_maskz_sll_epi64 (__mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsllq
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
IF count[63:0] > 63
dst[i+63:i] := 0
ELSE
dst[i+63:i] := ZeroExtend(a[i+63:i] << count[63:0])
FI
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
psllq
__m128i _mm_sll_epi64 (__m128i a, __m128i count)
Synopsis
__m128i _mm_sll_epi64 (__m128i a, __m128i count)
#include "emmintrin.h"
Instruction: psllq xmm, xmm
CPUID Flags: SSE2
Description
Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
IF count[63:0] > 63
dst[i+63:i] := 0
ELSE
dst[i+63:i] := ZeroExtend(a[i+63:i] << count[63:0])
FI
ENDFOR
Performance
vpsllq
__m256i _mm256_mask_sll_epi64 (__m256i src, __mmask8 k, __m256i a, __m128i count)
Synopsis
__m256i _mm256_mask_sll_epi64 (__m256i src, __mmask8 k, __m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpsllq
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
IF count[63:0] > 63
dst[i+63:i] := 0
ELSE
dst[i+63:i] := ZeroExtend(a[i+63:i] << count[63:0])
FI
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpsllq
__m256i _mm256_maskz_sll_epi64 (__mmask8 k, __m256i a, __m128i count)
Synopsis
__m256i _mm256_maskz_sll_epi64 (__mmask8 k, __m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpsllq
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
IF count[63:0] > 63
dst[i+63:i] := 0
ELSE
dst[i+63:i] := ZeroExtend(a[i+63:i] << count[63:0])
FI
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpsllq
__m256i _mm256_sll_epi64 (__m256i a, __m128i count)
Synopsis
__m256i _mm256_sll_epi64 (__m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpsllq ymm, ymm, xmm
CPUID Flags: AVX2
Description
Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
IF count[63:0] > 63
dst[i+63:i] := 0
ELSE
dst[i+63:i] := ZeroExtend(a[i+63:i] << count[63:0])
FI
ENDFOR
dst[MAX:256] := 0
Performance
vpsllq
__m512i _mm512_mask_sll_epi64 (__m512i src, __mmask8 k, __m512i a, __m128i count)
Synopsis
__m512i _mm512_mask_sll_epi64 (__m512i src, __mmask8 k, __m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpsllq zmm {k}, zmm, xmm
CPUID Flags: AVX512F
Description
Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
IF count[63:0] > 63
dst[i+63:i] := 0
ELSE
dst[i+63:i] := ZeroExtend(a[i+63:i] << count[63:0])
FI
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpsllq
__m512i _mm512_maskz_sll_epi64 (__mmask8 k, __m512i a, __m128i count)
Synopsis
__m512i _mm512_maskz_sll_epi64 (__mmask8 k, __m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpsllq zmm {k}, zmm, xmm
CPUID Flags: AVX512F
Description
Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
IF count[63:0] > 63
dst[i+63:i] := 0
ELSE
dst[i+63:i] := ZeroExtend(a[i+63:i] << count[63:0])
FI
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpsllq
__m512i _mm512_sll_epi64 (__m512i a, __m128i count)
Synopsis
__m512i _mm512_sll_epi64 (__m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpsllq zmm {k}, zmm, xmm
CPUID Flags: AVX512F
Description
Shift packed 64-bit integers in a left by count while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
IF count[63:0] > 63
dst[i+63:i] := 0
ELSE
dst[i+63:i] := ZeroExtend(a[i+63:i] << count[63:0])
FI
ENDFOR
dst[MAX:512] := 0
vpsllw
__m128i _mm_mask_slli_epi16 (__m128i src, __mmask8 k, __m128i a, unsigned int imm8)
Synopsis
__m128i _mm_mask_slli_epi16 (__m128i src, __mmask8 k, __m128i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsllw
CPUID Flags: AVX512VL + AVX512BW
Description
Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
IF imm8[7:0] > 15
dst[i+15:i] := 0
ELSE
dst[i+15:i] := ZeroExtend(a[i+15:i] << imm8[7:0])
FI
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:128] := 0
vpsllw
__m128i _mm_maskz_slli_epi16 (__mmask8 k, __m128i a, unsigned int imm8)
Synopsis
__m128i _mm_maskz_slli_epi16 (__mmask8 k, __m128i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsllw
CPUID Flags: AVX512VL + AVX512BW
Description
Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
IF imm8[7:0] > 15
dst[i+15:i] := 0
ELSE
dst[i+15:i] := ZeroExtend(a[i+15:i] << imm8[7:0])
FI
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
psllw
__m128i _mm_slli_epi16 (__m128i a, int imm8)
Synopsis
__m128i _mm_slli_epi16 (__m128i a, int imm8)
#include "emmintrin.h"
Instruction: psllw xmm, imm
CPUID Flags: SSE2
Description
Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*16
IF imm8[7:0] > 15
dst[i+15:i] := 0
ELSE
dst[i+15:i] := ZeroExtend(a[i+15:i] << imm8[7:0])
FI
ENDFOR
Performance
vpsllw
__m256i _mm256_mask_slli_epi16 (__m256i src, __mmask16 k, __m256i a, unsigned int imm8)
Synopsis
__m256i _mm256_mask_slli_epi16 (__m256i src, __mmask16 k, __m256i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsllw
CPUID Flags: AVX512VL + AVX512BW
Description
Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
IF imm8[7:0] > 15
dst[i+15:i] := 0
ELSE
dst[i+15:i] := ZeroExtend(a[i+15:i] << imm8[7:0])
FI
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
vpsllw
__m256i _mm256_maskz_slli_epi16 (__mmask16 k, __m256i a, unsigned int imm8)
Synopsis
__m256i _mm256_maskz_slli_epi16 (__mmask16 k, __m256i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsllw
CPUID Flags: AVX512VL + AVX512BW
Description
Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
IF imm8[7:0] > 15
dst[i+15:i] := 0
ELSE
dst[i+15:i] := ZeroExtend(a[i+15:i] << imm8[7:0])
FI
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpsllw
__m256i _mm256_slli_epi16 (__m256i a, int imm8)
Synopsis
__m256i _mm256_slli_epi16 (__m256i a, int imm8)
#include "immintrin.h"
Instruction: vpsllw ymm, ymm, imm
CPUID Flags: AVX2
Description
Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*16
IF imm8[7:0] > 15
dst[i+15:i] := 0
ELSE
dst[i+15:i] := ZeroExtend(a[i+15:i] << imm8[7:0])
FI
ENDFOR
dst[MAX:256] := 0
Performance
vpsllw
__m512i _mm512_mask_slli_epi16 (__m512i src, __mmask32 k, __m512i a, unsigned int imm8)
Synopsis
__m512i _mm512_mask_slli_epi16 (__m512i src, __mmask32 k, __m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsllw
CPUID Flags: AVX512BW
Description
Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
IF imm8[7:0] > 15
dst[i+15:i] := 0
ELSE
dst[i+15:i] := ZeroExtend(a[i+15:i] << imm8[7:0])
FI
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:512] := 0
vpsllw
__m512i _mm512_maskz_slli_epi16 (__mmask32 k, __m512i a, unsigned int imm8)
Synopsis
__m512i _mm512_maskz_slli_epi16 (__mmask32 k, __m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsllw
CPUID Flags: AVX512BW
Description
Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
IF imm8[7:0] > 15
dst[i+15:i] := 0
ELSE
dst[i+15:i] := ZeroExtend(a[i+15:i] << imm8[7:0])
FI
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpsllw
__m512i _mm512_slli_epi16 (__m512i a, unsigned int imm8)
Synopsis
__m512i _mm512_slli_epi16 (__m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsllw
CPUID Flags: AVX512BW
Description
Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 31
i := j*16
IF imm8[7:0] > 15
dst[i+15:i] := 0
ELSE
dst[i+15:i] := ZeroExtend(a[i+15:i] << imm8[7:0])
FI
ENDFOR
dst[MAX:512] := 0
vpslld
__m128i _mm_mask_slli_epi32 (__m128i src, __mmask8 k, __m128i a, unsigned int imm8)
Synopsis
__m128i _mm_mask_slli_epi32 (__m128i src, __mmask8 k, __m128i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpslld
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
IF imm8[7:0] > 31
dst[i+31:i] := 0
ELSE
dst[i+31:i] := ZeroExtend(a[i+31:i] << imm8[7:0])
FI
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vpslld
__m128i _mm_maskz_slli_epi32 (__mmask8 k, __m128i a, unsigned int imm8)
Synopsis
__m128i _mm_maskz_slli_epi32 (__mmask8 k, __m128i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpslld
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
IF imm8[7:0] > 31
dst[i+31:i] := 0
ELSE
dst[i+31:i] := ZeroExtend(a[i+31:i] << imm8[7:0])
FI
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
pslld
__m128i _mm_slli_epi32 (__m128i a, int imm8)
Synopsis
__m128i _mm_slli_epi32 (__m128i a, int imm8)
#include "emmintrin.h"
Instruction: pslld xmm, imm
CPUID Flags: SSE2
Description
Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
IF imm8[7:0] > 31
dst[i+31:i] := 0
ELSE
dst[i+31:i] := ZeroExtend(a[i+31:i] << imm8[7:0])
FI
ENDFOR
Performance
vpslld
__m256i _mm256_mask_slli_epi32 (__m256i src, __mmask8 k, __m256i a, unsigned int imm8)
Synopsis
__m256i _mm256_mask_slli_epi32 (__m256i src, __mmask8 k, __m256i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpslld
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
IF imm8[7:0] > 31
dst[i+31:i] := 0
ELSE
dst[i+31:i] := ZeroExtend(a[i+31:i] << imm8[7:0])
FI
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vpslld
__m256i _mm256_maskz_slli_epi32 (__mmask8 k, __m256i a, unsigned int imm8)
Synopsis
__m256i _mm256_maskz_slli_epi32 (__mmask8 k, __m256i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpslld
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
IF imm8[7:0] > 31
dst[i+31:i] := 0
ELSE
dst[i+31:i] := ZeroExtend(a[i+31:i] << imm8[7:0])
FI
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpslld
__m256i _mm256_slli_epi32 (__m256i a, int imm8)
Synopsis
__m256i _mm256_slli_epi32 (__m256i a, int imm8)
#include "immintrin.h"
Instruction: vpslld ymm, ymm, imm
CPUID Flags: AVX2
Description
Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
IF imm8[7:0] > 31
dst[i+31:i] := 0
ELSE
dst[i+31:i] := ZeroExtend(a[i+31:i] << imm8[7:0])
FI
ENDFOR
dst[MAX:256] := 0
Performance
vpslld
__m512i _mm512_mask_slli_epi32 (__m512i src, __mmask16 k, __m512i a, unsigned int imm8)
Synopsis
__m512i _mm512_mask_slli_epi32 (__m512i src, __mmask16 k, __m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpslld zmm {k}, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
IF imm8[7:0] > 31
dst[i+31:i] := 0
ELSE
dst[i+31:i] := ZeroExtend(a[i+31:i] << imm8[7:0])
FI
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpslld
__m512i _mm512_maskz_slli_epi32 (__mmask16 k, __m512i a, unsigned int imm8)
Synopsis
__m512i _mm512_maskz_slli_epi32 (__mmask16 k, __m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpslld zmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
IF imm8[7:0] > 31
dst[i+31:i] := 0
ELSE
dst[i+31:i] := ZeroExtend(a[i+31:i] << imm8[7:0])
FI
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpslld
__m512i _mm512_slli_epi32 (__m512i a, unsigned int imm8)
Synopsis
__m512i _mm512_slli_epi32 (__m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpslld zmm {k}, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
IF imm8[7:0] > 31
dst[i+31:i] := 0
ELSE
dst[i+31:i] := ZeroExtend(a[i+31:i] << imm8[7:0])
FI
ENDFOR
dst[MAX:512] := 0
vpsllq
__m128i _mm_mask_slli_epi64 (__m128i src, __mmask8 k, __m128i a, unsigned int imm8)
Synopsis
__m128i _mm_mask_slli_epi64 (__m128i src, __mmask8 k, __m128i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsllq
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
IF imm8[7:0] > 63
dst[i+63:i] := 0
ELSE
dst[i+63:i] := ZeroExtend(a[i+63:i] << imm8[7:0])
FI
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpsllq
__m128i _mm_maskz_slli_epi64 (__mmask8 k, __m128i a, unsigned int imm8)
Synopsis
__m128i _mm_maskz_slli_epi64 (__mmask8 k, __m128i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsllq
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
IF imm8[7:0] > 63
dst[i+63:i] := 0
ELSE
dst[i+63:i] := ZeroExtend(a[i+63:i] << imm8[7:0])
FI
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
psllq
__m128i _mm_slli_epi64 (__m128i a, int imm8)
Synopsis
__m128i _mm_slli_epi64 (__m128i a, int imm8)
#include "emmintrin.h"
Instruction: psllq xmm, imm
CPUID Flags: SSE2
Description
Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
IF imm8[7:0] > 63
dst[i+63:i] := 0
ELSE
dst[i+63:i] := ZeroExtend(a[i+63:i] << imm8[7:0])
FI
ENDFOR
Performance
vpsllq
__m256i _mm256_mask_slli_epi64 (__m256i src, __mmask8 k, __m256i a, unsigned int imm8)
Synopsis
__m256i _mm256_mask_slli_epi64 (__m256i src, __mmask8 k, __m256i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsllq
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
IF imm8[7:0] > 63
dst[i+63:i] := 0
ELSE
dst[i+63:i] := ZeroExtend(a[i+63:i] << imm8[7:0])
FI
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpsllq
__m256i _mm256_maskz_slli_epi64 (__mmask8 k, __m256i a, unsigned int imm8)
Synopsis
__m256i _mm256_maskz_slli_epi64 (__mmask8 k, __m256i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsllq
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
IF imm8[7:0] > 63
dst[i+63:i] := 0
ELSE
dst[i+63:i] := ZeroExtend(a[i+63:i] << imm8[7:0])
FI
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpsllq
__m256i _mm256_slli_epi64 (__m256i a, int imm8)
Synopsis
__m256i _mm256_slli_epi64 (__m256i a, int imm8)
#include "immintrin.h"
Instruction: vpsllq ymm, ymm, imm
CPUID Flags: AVX2
Description
Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
IF imm8[7:0] > 63
dst[i+63:i] := 0
ELSE
dst[i+63:i] := ZeroExtend(a[i+63:i] << imm8[7:0])
FI
ENDFOR
dst[MAX:256] := 0
Performance
vpsllq
__m512i _mm512_mask_slli_epi64 (__m512i src, __mmask8 k, __m512i a, unsigned int imm8)
Synopsis
__m512i _mm512_mask_slli_epi64 (__m512i src, __mmask8 k, __m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsllq zmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
IF imm8[7:0] > 63
dst[i+63:i] := 0
ELSE
dst[i+63:i] := ZeroExtend(a[i+63:i] << imm8[7:0])
FI
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpsllq
__m512i _mm512_maskz_slli_epi64 (__mmask8 k, __m512i a, unsigned int imm8)
Synopsis
__m512i _mm512_maskz_slli_epi64 (__mmask8 k, __m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsllq zmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
IF imm8[7:0] > 63
dst[i+63:i] := 0
ELSE
dst[i+63:i] := ZeroExtend(a[i+63:i] << imm8[7:0])
FI
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpsllq
__m512i _mm512_slli_epi64 (__m512i a, unsigned int imm8)
Synopsis
__m512i _mm512_slli_epi64 (__m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsllq zmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
IF imm8[7:0] > 63
dst[i+63:i] := 0
ELSE
dst[i+63:i] := ZeroExtend(a[i+63:i] << imm8[7:0])
FI
ENDFOR
dst[MAX:512] := 0
pslldq
__m128i _mm_slli_si128 (__m128i a, int imm8)
Synopsis
__m128i _mm_slli_si128 (__m128i a, int imm8)
#include "emmintrin.h"
Instruction: pslldq xmm, imm
CPUID Flags: SSE2
Description
Shift a left by imm8 bytes while shifting in zeros, and store the results in dst.
Operation
tmp := imm8[7:0]
IF tmp > 15
tmp := 16
FI
dst[127:0] := a[127:0] << (tmp*8)
Performance
vpslldq
__m256i _mm256_slli_si256 (__m256i a, const int imm8)
Synopsis
__m256i _mm256_slli_si256 (__m256i a, const int imm8)
#include "immintrin.h"
Instruction: vpslldq ymm, ymm, imm
CPUID Flags: AVX2
Description
Shift 128-bit lanes in a left by imm8 bytes while shifting in zeros, and store the results in dst.
Operation
tmp := imm8[7:0]
IF tmp > 15
tmp := 16
FI
dst[127:0] := a[127:0] << (tmp*8)
dst[255:128] := a[255:128] << (tmp*8)
dst[MAX:256] := 0
Performance
vpsllvw
__m128i _mm_mask_sllv_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i count)
Synopsis
__m128i _mm_mask_sllv_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsllvw
CPUID Flags: AVX512VL + AVX512BW
Description
Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := ZeroExtend(a[i+15:i] << count[i+15:i])
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:128] := 0
vpsllvw
__m128i _mm_maskz_sllv_epi16 (__mmask8 k, __m128i a, __m128i count)
Synopsis
__m128i _mm_maskz_sllv_epi16 (__mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsllvw
CPUID Flags: AVX512VL + AVX512BW
Description
Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := ZeroExtend(a[i+15:i] << count[i+15:i])
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpsllvw
__m128i _mm_sllv_epi16 (__m128i a, __m128i count)
Synopsis
__m128i _mm_sllv_epi16 (__m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsllvw
CPUID Flags: AVX512VL + AVX512BW
Description
Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*16
dst[i+15:i] := ZeroExtend(a[i+15:i] << count[i+15:i])
ENDFOR
dst[MAX:128] := 0
vpsllvw
__m256i _mm256_mask_sllv_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i count)
Synopsis
__m256i _mm256_mask_sllv_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsllvw
CPUID Flags: AVX512VL + AVX512BW
Description
Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := ZeroExtend(a[i+15:i] << count[i+15:i])
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
vpsllvw
__m256i _mm256_maskz_sllv_epi16 (__mmask16 k, __m256i a, __m256i count)
Synopsis
__m256i _mm256_maskz_sllv_epi16 (__mmask16 k, __m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsllvw
CPUID Flags: AVX512VL + AVX512BW
Description
Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := ZeroExtend(a[i+15:i] << count[i+15:i])
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpsllvw
__m256i _mm256_sllv_epi16 (__m256i a, __m256i count)
Synopsis
__m256i _mm256_sllv_epi16 (__m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsllvw
CPUID Flags: AVX512VL + AVX512BW
Description
Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*16
dst[i+15:i] := ZeroExtend(a[i+15:i] << count[i+15:i])
ENDFOR
dst[MAX:256] := 0
vpsllvw
__m512i _mm512_mask_sllv_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i count)
Synopsis
__m512i _mm512_mask_sllv_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsllvw
CPUID Flags: AVX512BW
Description
Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := ZeroExtend(a[i+15:i] << count[i+15:i])
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:512] := 0
vpsllvw
__m512i _mm512_maskz_sllv_epi16 (__mmask32 k, __m512i a, __m512i count)
Synopsis
__m512i _mm512_maskz_sllv_epi16 (__mmask32 k, __m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsllvw
CPUID Flags: AVX512BW
Description
Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := ZeroExtend(a[i+15:i] << count[i+15:i])
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpsllvw
__m512i _mm512_sllv_epi16 (__m512i a, __m512i count)
Synopsis
__m512i _mm512_sllv_epi16 (__m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsllvw
CPUID Flags: AVX512BW
Description
Shift packed 16-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 31
i := j*16
dst[i+15:i] := ZeroExtend(a[i+15:i] << count[i+15:i])
ENDFOR
dst[MAX:512] := 0
vpsllvd
__m128i _mm_mask_sllv_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i count)
Synopsis
__m128i _mm_mask_sllv_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsllvd
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := ZeroExtend(a[i+31:i] << count[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vpsllvd
__m128i _mm_maskz_sllv_epi32 (__mmask8 k, __m128i a, __m128i count)
Synopsis
__m128i _mm_maskz_sllv_epi32 (__mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsllvd
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := ZeroExtend(a[i+31:i] << count[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpsllvd
__m128i _mm_sllv_epi32 (__m128i a, __m128i count)
Synopsis
__m128i _mm_sllv_epi32 (__m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsllvd xmm, xmm, xmm
CPUID Flags: AVX2
Description
Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := ZeroExtend(a[i+31:i] << count[i+31:i])
ENDFOR
dst[MAX:128] := 0
Performance
vpsllvd
__m256i _mm256_mask_sllv_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i count)
Synopsis
__m256i _mm256_mask_sllv_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsllvd
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := ZeroExtend(a[i+31:i] << count[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vpsllvd
__m256i _mm256_maskz_sllv_epi32 (__mmask8 k, __m256i a, __m256i count)
Synopsis
__m256i _mm256_maskz_sllv_epi32 (__mmask8 k, __m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsllvd
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := ZeroExtend(a[i+31:i] << count[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpsllvd
__m256i _mm256_sllv_epi32 (__m256i a, __m256i count)
Synopsis
__m256i _mm256_sllv_epi32 (__m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsllvd ymm, ymm, ymm
CPUID Flags: AVX2
Description
Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := ZeroExtend(a[i+31:i] << count[i+31:i])
ENDFOR
dst[MAX:256] := 0
Performance
vpsllvd
__m512i _mm512_mask_sllv_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i count)
Synopsis
__m512i _mm512_mask_sllv_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsllvd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := ZeroExtend(a[i+31:i] << count[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpsllvd
__m512i _mm512_maskz_sllv_epi32 (__mmask16 k, __m512i a, __m512i count)
Synopsis
__m512i _mm512_maskz_sllv_epi32 (__mmask16 k, __m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsllvd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := ZeroExtend(a[i+31:i] << count[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpsllvd
__m512i _mm512_sllv_epi32 (__m512i a, __m512i count)
Synopsis
__m512i _mm512_sllv_epi32 (__m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsllvd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Shift packed 32-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := ZeroExtend(a[i+31:i] << count[i+31:i])
ENDFOR
dst[MAX:512] := 0
vpsllvq
__m128i _mm_mask_sllv_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i count)
Synopsis
__m128i _mm_mask_sllv_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsllvq
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := ZeroExtend(a[i+63:i] << count[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpsllvq
__m128i _mm_maskz_sllv_epi64 (__mmask8 k, __m128i a, __m128i count)
Synopsis
__m128i _mm_maskz_sllv_epi64 (__mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsllvq
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := ZeroExtend(a[i+63:i] << count[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpsllvq
__m128i _mm_sllv_epi64 (__m128i a, __m128i count)
Synopsis
__m128i _mm_sllv_epi64 (__m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsllvq xmm, xmm, xmm
CPUID Flags: AVX2
Description
Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := ZeroExtend(a[i+63:i] << count[i+63:i])
ENDFOR
dst[MAX:128] := 0
Performance
vpsllvq
__m256i _mm256_mask_sllv_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i count)
Synopsis
__m256i _mm256_mask_sllv_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsllvq
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := ZeroExtend(a[i+63:i] << count[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpsllvq
__m256i _mm256_maskz_sllv_epi64 (__mmask8 k, __m256i a, __m256i count)
Synopsis
__m256i _mm256_maskz_sllv_epi64 (__mmask8 k, __m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsllvq
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := ZeroExtend(a[i+63:i] << count[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpsllvq
__m256i _mm256_sllv_epi64 (__m256i a, __m256i count)
Synopsis
__m256i _mm256_sllv_epi64 (__m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsllvq ymm, ymm, ymm
CPUID Flags: AVX2
Description
Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := ZeroExtend(a[i+63:i] << count[i+63:i])
ENDFOR
dst[MAX:256] := 0
Performance
vpsllvq
__m512i _mm512_mask_sllv_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i count)
Synopsis
__m512i _mm512_mask_sllv_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsllvq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := ZeroExtend(a[i+63:i] << count[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpsllvq
__m512i _mm512_maskz_sllv_epi64 (__mmask8 k, __m512i a, __m512i count)
Synopsis
__m512i _mm512_maskz_sllv_epi64 (__mmask8 k, __m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsllvq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := ZeroExtend(a[i+63:i] << count[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpsllvq
__m512i _mm512_sllv_epi64 (__m512i a, __m512i count)
Synopsis
__m512i _mm512_sllv_epi64 (__m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsllvq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Shift packed 64-bit integers in a left by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := ZeroExtend(a[i+63:i] << count[i+63:i])
ENDFOR
dst[MAX:512] := 0
spflt
void _mm_spflt_32 (unsigned int r1)
Synopsis
void _mm_spflt_32 (unsigned int r1)
#include "immintrin.h"
Instruction: spflt r
CPUID Flags: KNCNI
Description
Set performance monitoring filtering mask to 32-bit unsigned integer r1.
Operation
SetPerfMonMask(r1[31:0])
spflt
void _mm_spflt_64 (unsigned __int64 r1)
Synopsis
void _mm_spflt_64 (unsigned __int64 r1)
#include "immintrin.h"
Instruction: spflt r
CPUID Flags: KNCNI
Description
Set performance monitoring filtering mask to 64-bit unsigned integer r1.
Operation
SetPerfMonMask(r1[63:0])
vsqrtpd
__m128d _mm_mask_sqrt_pd (__m128d src, __mmask8 k, __m128d a)
Synopsis
__m128d _mm_mask_sqrt_pd (__m128d src, __mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vsqrtpd
CPUID Flags: AVX512VL + AVX512F
Description
Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := SQRT(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vsqrtpd
__m128d _mm_maskz_sqrt_pd (__mmask8 k, __m128d a)
Synopsis
__m128d _mm_maskz_sqrt_pd (__mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vsqrtpd
CPUID Flags: AVX512VL + AVX512F
Description
Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := SQRT(a[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
sqrtpd
__m128d _mm_sqrt_pd (__m128d a)
Synopsis
__m128d _mm_sqrt_pd (__m128d a)
#include "emmintrin.h"
Instruction: sqrtpd xmm, xmm
CPUID Flags: SSE2
Description
Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := SQRT(a[i+63:i])
ENDFOR
Performance
vsqrtpd
__m256d _mm256_mask_sqrt_pd (__m256d src, __mmask8 k, __m256d a)
Synopsis
__m256d _mm256_mask_sqrt_pd (__m256d src, __mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vsqrtpd
CPUID Flags: AVX512VL + AVX512F
Description
Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := SQRT(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vsqrtpd
__m256d _mm256_maskz_sqrt_pd (__mmask8 k, __m256d a)
Synopsis
__m256d _mm256_maskz_sqrt_pd (__mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vsqrtpd
CPUID Flags: AVX512VL + AVX512F
Description
Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := SQRT(a[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vsqrtpd
__m256d _mm256_sqrt_pd (__m256d a)
Synopsis
__m256d _mm256_sqrt_pd (__m256d a)
#include "immintrin.h"
Instruction: vsqrtpd ymm, ymm
CPUID Flags: AVX
Description
Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := SQRT(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
Performance
vsqrtpd
__m512d _mm512_mask_sqrt_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_sqrt_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vsqrtpd zmm {k}, zmm
CPUID Flags: AVX512F
Description
Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := SQRT(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vsqrtpd
__m512d _mm512_maskz_sqrt_pd (__mmask8 k, __m512d a)
Synopsis
__m512d _mm512_maskz_sqrt_pd (__mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vsqrtpd zmm {k}, zmm
CPUID Flags: AVX512F
Description
Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := SQRT(a[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vsqrtpd
__m512d _mm512_sqrt_pd (__m512d a)
Synopsis
__m512d _mm512_sqrt_pd (__m512d a)
#include "immintrin.h"
Instruction: vsqrtpd zmm {k}, zmm
CPUID Flags: AVX512F
Description
Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := SQRT(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
vsqrtps
__m128 _mm_mask_sqrt_ps (__m128 src, __mmask8 k, __m128 a)
Synopsis
__m128 _mm_mask_sqrt_ps (__m128 src, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vsqrtps
CPUID Flags: AVX512VL + AVX512F
Description
Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := SQRT(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vsqrtps
__m128 _mm_maskz_sqrt_ps (__mmask8 k, __m128 a)
Synopsis
__m128 _mm_maskz_sqrt_ps (__mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vsqrtps
CPUID Flags: AVX512VL + AVX512F
Description
Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := SQRT(a[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
sqrtps
__m128 _mm_sqrt_ps (__m128 a)
Synopsis
__m128 _mm_sqrt_ps (__m128 a)
#include "xmmintrin.h"
Instruction: sqrtps xmm, xmm
CPUID Flags: SSE
Description
Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := SQRT(a[i+31:i])
ENDFOR
Performance
vsqrtps
__m256 _mm256_mask_sqrt_ps (__m256 src, __mmask8 k, __m256 a)
Synopsis
__m256 _mm256_mask_sqrt_ps (__m256 src, __mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vsqrtps
CPUID Flags: AVX512VL + AVX512F
Description
Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := SQRT(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vsqrtps
__m256 _mm256_maskz_sqrt_ps (__mmask8 k, __m256 a)
Synopsis
__m256 _mm256_maskz_sqrt_ps (__mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vsqrtps
CPUID Flags: AVX512VL + AVX512F
Description
Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := SQRT(a[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vsqrtps
__m256 _mm256_sqrt_ps (__m256 a)
Synopsis
__m256 _mm256_sqrt_ps (__m256 a)
#include "immintrin.h"
Instruction: vsqrtps ymm, ymm
CPUID Flags: AVX
Description
Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := SQRT(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
Performance
vsqrtps
__m512 _mm512_mask_sqrt_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_sqrt_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vsqrtps zmm {k}, zmm
CPUID Flags: AVX512F
Description
Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := SQRT(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vsqrtps
__m512 _mm512_maskz_sqrt_ps (__mmask16 k, __m512 a)
Synopsis
__m512 _mm512_maskz_sqrt_ps (__mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vsqrtps zmm {k}, zmm
CPUID Flags: AVX512F
Description
Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := SQRT(a[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vsqrtps
__m512 _mm512_sqrt_ps (__m512 a)
Synopsis
__m512 _mm512_sqrt_ps (__m512 a)
#include "immintrin.h"
Instruction: vsqrtps zmm {k}, zmm
CPUID Flags: AVX512F
Description
Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := SQRT(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
vsqrtpd
__m512d _mm512_mask_sqrt_round_pd (__m512d src, __mmask8 k, __m512d a, int rounding)
Synopsis
__m512d _mm512_mask_sqrt_round_pd (__m512d src, __mmask8 k, __m512d a, int rounding)
#include "immintrin.h"
Instruction: vsqrtpd zmm {k}, zmm {er}
CPUID Flags: AVX512F
Description
Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := SQRT(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vsqrtpd
__m512d _mm512_maskz_sqrt_round_pd (__mmask8 k, __m512d a, int rounding)
Synopsis
__m512d _mm512_maskz_sqrt_round_pd (__mmask8 k, __m512d a, int rounding)
#include "immintrin.h"
Instruction: vsqrtpd zmm {k}, zmm {er}
CPUID Flags: AVX512F
Description
Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := SQRT(a[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vsqrtpd
__m512d _mm512_sqrt_round_pd (__m512d a, int rounding)
Synopsis
__m512d _mm512_sqrt_round_pd (__m512d a, int rounding)
#include "immintrin.h"
Instruction: vsqrtpd zmm {k}, zmm {er}
CPUID Flags: AVX512F
Description
Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := SQRT(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
vsqrtps
__m512 _mm512_mask_sqrt_round_ps (__m512 src, __mmask16 k, __m512 a, int rounding)
Synopsis
__m512 _mm512_mask_sqrt_round_ps (__m512 src, __mmask16 k, __m512 a, int rounding)
#include "immintrin.h"
Instruction: vsqrtps zmm {k}, zmm {er}
CPUID Flags: AVX512F
Description
Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := SQRT(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vsqrtps
__m512 _mm512_maskz_sqrt_round_ps (__mmask16 k, __m512 a, int rounding)
Synopsis
__m512 _mm512_maskz_sqrt_round_ps (__mmask16 k, __m512 a, int rounding)
#include "immintrin.h"
Instruction: vsqrtps zmm {k}, zmm {er}
CPUID Flags: AVX512F
Description
Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set). Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := SQRT(a[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vsqrtps
__m512 _mm512_sqrt_round_ps (__m512 a, int rounding)
Synopsis
__m512 _mm512_sqrt_round_ps (__m512 a, int rounding)
#include "immintrin.h"
Instruction: vsqrtps zmm {k}, zmm {er}
CPUID Flags: AVX512F
Description
Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := SQRT(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
vsqrtsd
__m128d _mm_mask_sqrt_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int rounding)
Synopsis
__m128d _mm_mask_sqrt_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vsqrtsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Compute the square root of the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from b to the upper element of dst. Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[63:0] := SQRT(a[63:0])
ELSE
dst[63:0] := src[63:0]
FI
dst[127:64] := b[127:64]
dst[MAX:128] := 0
vsqrtsd
__m128d _mm_maskz_sqrt_round_sd (__mmask8 k, __m128d a, __m128d b, int rounding)
Synopsis
__m128d _mm_maskz_sqrt_round_sd (__mmask8 k, __m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vsqrtsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Compute the square root of the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from b to the upper element of dst. Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[63:0] := SQRT(a[63:0])
ELSE
dst[63:0] := 0
FI
dst[127:64] := b[127:64]
dst[MAX:128] := 0
vsqrtsd
__m128d _mm_sqrt_round_sd (__m128d a, __m128d b, int rounding)
Synopsis
__m128d _mm_sqrt_round_sd (__m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vsqrtsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Compute the square root of the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper element from b to the upper element of dst. Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[63:0] := SQRT(a[63:0])
dst[127:64] := b[127:64]
dst[MAX:128] := 0
vsqrtss
__m128 _mm_mask_sqrt_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int rounding)
Synopsis
__m128 _mm_mask_sqrt_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vsqrtss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Compute the square root of the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from b to the upper elements of dst. Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[31:0] := SQRT(a[31:0])
ELSE
dst[31:0] := src[31:0]
FI
dst[127:32] := b[127:32]
dst[MAX:128] := 0
vsqrtss
__m128 _mm_maskz_sqrt_round_ss (__mmask8 k, __m128 a, __m128 b, int rounding)
Synopsis
__m128 _mm_maskz_sqrt_round_ss (__mmask8 k, __m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vsqrtss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Compute the square root of the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from b to the upper elements of dst. Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[31:0] := SQRT(a[31:0])
ELSE
dst[31:0] := 0
FI
dst[127:32] := b[127:32]
dst[MAX:128] := 0
vsqrtss
__m128 _mm_sqrt_round_ss (__m128 a, __m128 b, int rounding)
Synopsis
__m128 _mm_sqrt_round_ss (__m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vsqrtss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Compute the square root of the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper 3 packed elements from b to the upper elements of dst. Rounding is done according to the rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[31:0] := SQRT(a[31:0])
dst[127:32] := b[127:32]
dst[MAX:128] := 0
vsqrtsd
__m128d _mm_mask_sqrt_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_mask_sqrt_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vsqrtsd xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Compute the square root of the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from b to the upper element of dst.
Operation
IF k[0]
dst[63:0] := SQRT(a[63:0])
ELSE
dst[63:0] := src[63:0]
FI
dst[127:64] := b[127:64]
dst[MAX:128] := 0
vsqrtsd
__m128d _mm_maskz_sqrt_sd (__mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_maskz_sqrt_sd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vsqrtsd xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Compute the square root of the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from b to the upper element of dst.
Operation
IF k[0]
dst[63:0] := SQRT(a[63:0])
ELSE
dst[63:0] := 0
FI
dst[127:64] := b[127:64]
dst[MAX:128] := 0
sqrtsd
__m128d _mm_sqrt_sd (__m128d a, __m128d b)
Synopsis
__m128d _mm_sqrt_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: sqrtsd xmm, xmm
CPUID Flags: SSE2
Description
Compute the square root of the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper element from b to the upper element of dst.
Operation
dst[63:0] := SQRT(a[63:0])
dst[127:64] := b[127:64]
Performance
vsqrtss
__m128 _mm_mask_sqrt_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_mask_sqrt_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vsqrtss xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Compute the square root of the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from b to the upper elements of dst.
Operation
IF k[0]
dst[31:0] := SQRT(a[31:0])
ELSE
dst[31:0] := src[31:0]
FI
dst[127:32] := b[127:32]
dst[MAX:128] := 0
vsqrtss
__m128 _mm_maskz_sqrt_ss (__mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_maskz_sqrt_ss (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vsqrtss xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Compute the square root of the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from b to the upper elements of dst.
Operation
IF k[0]
dst[31:0] := SQRT(a[31:0])
ELSE
dst[31:0] := 0
FI
dst[127:32] := b[127:32]
dst[MAX:128] := 0
sqrtss
__m128 _mm_sqrt_ss (__m128 a)
Synopsis
__m128 _mm_sqrt_ss (__m128 a)
#include "xmmintrin.h"
Instruction: sqrtss xmm, xmm
CPUID Flags: SSE
Description
Compute the square root of the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
dst[31:0] := SQRT(a[31:0])
dst[127:32] := a[127:32]
Performance
vpsraw
__m128i _mm_mask_sra_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i count)
Synopsis
__m128i _mm_mask_sra_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsraw
CPUID Flags: AVX512VL + AVX512BW
Description
Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
IF count[63:0] > 15
dst[i+15:i] := SignBit
ELSE
dst[i+15:i] := SignExtend(a[i+15:i] >> count[63:0])
FI
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:128] := 0
vpsraw
__m128i _mm_maskz_sra_epi16 (__mmask8 k, __m128i a, __m128i count)
Synopsis
__m128i _mm_maskz_sra_epi16 (__mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsraw
CPUID Flags: AVX512VL + AVX512BW
Description
Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
IF count[63:0] > 15
dst[i+15:i] := SignBit
ELSE
dst[i+15:i] := SignExtend(a[i+15:i] >> count[63:0])
FI
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
psraw
__m128i _mm_sra_epi16 (__m128i a, __m128i count)
Synopsis
__m128i _mm_sra_epi16 (__m128i a, __m128i count)
#include "emmintrin.h"
Instruction: psraw xmm, xmm
CPUID Flags: SSE2
Description
Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*16
IF count[63:0] > 15
dst[i+15:i] := SignBit
ELSE
dst[i+15:i] := SignExtend(a[i+15:i] >> count[63:0])
FI
ENDFOR
Performance
vpsraw
__m256i _mm256_mask_sra_epi16 (__m256i src, __mmask16 k, __m256i a, __m128i count)
Synopsis
__m256i _mm256_mask_sra_epi16 (__m256i src, __mmask16 k, __m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpsraw
CPUID Flags: AVX512VL + AVX512BW
Description
Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
IF count[63:0] > 15
dst[i+15:i] := SignBit
ELSE
dst[i+15:i] := SignExtend(a[i+15:i] >> count[63:0])
FI
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
vpsraw
__m256i _mm256_maskz_sra_epi16 (__mmask16 k, __m256i a, __m128i count)
Synopsis
__m256i _mm256_maskz_sra_epi16 (__mmask16 k, __m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpsraw
CPUID Flags: AVX512VL + AVX512BW
Description
Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
IF count[63:0] > 15
dst[i+15:i] := SignBit
ELSE
dst[i+15:i] := SignExtend(a[i+15:i] >> count[63:0])
FI
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpsraw
__m256i _mm256_sra_epi16 (__m256i a, __m128i count)
Synopsis
__m256i _mm256_sra_epi16 (__m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpsraw ymm, ymm, xmm
CPUID Flags: AVX2
Description
Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*16
IF count[63:0] > 15
dst[i+15:i] := SignBit
ELSE
dst[i+15:i] := SignExtend(a[i+15:i] >> count[63:0])
FI
ENDFOR
dst[MAX:256] := 0
Performance
vpsraw
__m512i _mm512_mask_sra_epi16 (__m512i src, __mmask32 k, __m512i a, __m128i count)
Synopsis
__m512i _mm512_mask_sra_epi16 (__m512i src, __mmask32 k, __m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpsraw
CPUID Flags: AVX512BW
Description
Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
IF count[63:0] > 15
dst[i+15:i] := SignBit
ELSE
dst[i+15:i] := SignExtend(a[i+15:i] >> count[63:0])
FI
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:512] := 0
vpsraw
__m512i _mm512_maskz_sra_epi16 (__mmask32 k, __m512i a, __m128i count)
Synopsis
__m512i _mm512_maskz_sra_epi16 (__mmask32 k, __m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpsraw
CPUID Flags: AVX512BW
Description
Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
IF count[63:0] > 15
dst[i+15:i] := SignBit
ELSE
dst[i+15:i] := SignExtend(a[i+15:i] >> count[63:0])
FI
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpsraw
__m512i _mm512_sra_epi16 (__m512i a, __m128i count)
Synopsis
__m512i _mm512_sra_epi16 (__m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpsraw
CPUID Flags: AVX512BW
Description
Shift packed 16-bit integers in a right by count while shifting in sign bits, and store the results in dst.
Operation
FOR j := 0 to 31
i := j*16
IF count[63:0] > 15
dst[i+15:i] := SignBit
ELSE
dst[i+15:i] := SignExtend(a[i+15:i] >> count[63:0])
FI
ENDFOR
dst[MAX:512] := 0
vpsrad
__m128i _mm_mask_sra_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i count)
Synopsis
__m128i _mm_mask_sra_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrad
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
IF count[63:0] > 31
dst[i+31:i] := SignBit
ELSE
dst[i+31:i] := SignExtend(a[i+31:i] >> count[63:0])
FI
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vpsrad
__m128i _mm_maskz_sra_epi32 (__mmask8 k, __m128i a, __m128i count)
Synopsis
__m128i _mm_maskz_sra_epi32 (__mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrad
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
IF count[63:0] > 31
dst[i+31:i] := SignBit
ELSE
dst[i+31:i] := SignExtend(a[i+31:i] >> count[63:0])
FI
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
psrad
__m128i _mm_sra_epi32 (__m128i a, __m128i count)
Synopsis
__m128i _mm_sra_epi32 (__m128i a, __m128i count)
#include "emmintrin.h"
Instruction: psrad xmm, xmm
CPUID Flags: SSE2
Description
Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
IF count[63:0] > 31
dst[i+31:i] := SignBit
ELSE
dst[i+31:i] := SignExtend(a[i+31:i] >> count[63:0])
FI
ENDFOR
Performance
vpsrad
__m256i _mm256_mask_sra_epi32 (__m256i src, __mmask8 k, __m256i a, __m128i count)
Synopsis
__m256i _mm256_mask_sra_epi32 (__m256i src, __mmask8 k, __m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrad
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
IF count[63:0] > 31
dst[i+31:i] := SignBit
ELSE
dst[i+31:i] := SignExtend(a[i+31:i] >> count[63:0])
FI
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vpsrad
__m256i _mm256_maskz_sra_epi32 (__mmask8 k, __m256i a, __m128i count)
Synopsis
__m256i _mm256_maskz_sra_epi32 (__mmask8 k, __m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrad
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
IF count[63:0] > 31
dst[i+31:i] := SignBit
ELSE
dst[i+31:i] := SignExtend(a[i+31:i] >> count[63:0])
FI
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpsrad
__m256i _mm256_sra_epi32 (__m256i a, __m128i count)
Synopsis
__m256i _mm256_sra_epi32 (__m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrad ymm, ymm, xmm
CPUID Flags: AVX2
Description
Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
IF count[63:0] > 31
dst[i+31:i] := SignBit
ELSE
dst[i+31:i] := SignExtend(a[i+31:i] >> count[63:0])
FI
ENDFOR
dst[MAX:256] := 0
Performance
vpsrad
__m512i _mm512_mask_sra_epi32 (__m512i src, __mmask16 k, __m512i a, __m128i count)
Synopsis
__m512i _mm512_mask_sra_epi32 (__m512i src, __mmask16 k, __m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrad zmm {k}, zmm, xmm
CPUID Flags: AVX512F
Description
Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
IF count[63:0] > 31
dst[i+31:i] := SignBit
ELSE
dst[i+31:i] := SignExtend(a[i+31:i] >> count[63:0])
FI
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpsrad
__m512i _mm512_maskz_sra_epi32 (__mmask16 k, __m512i a, __m128i count)
Synopsis
__m512i _mm512_maskz_sra_epi32 (__mmask16 k, __m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrad zmm {k}, zmm, xmm
CPUID Flags: AVX512F
Description
Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
IF count[63:0] > 31
dst[i+31:i] := SignBit
ELSE
dst[i+31:i] := SignExtend(a[i+31:i] >> count[63:0])
FI
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpsrad
__m512i _mm512_sra_epi32 (__m512i a, __m128i count)
Synopsis
__m512i _mm512_sra_epi32 (__m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrad zmm {k}, zmm, xmm
CPUID Flags: AVX512F
Description
Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
IF count[63:0] > 31
dst[i+31:i] := SignBit
ELSE
dst[i+31:i] := SignExtend(a[i+31:i] >> count[63:0])
FI
ENDFOR
dst[MAX:512] := 0
vpsraq
__m128i _mm_mask_sra_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i count)
Synopsis
__m128i _mm_mask_sra_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsraq
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
IF count[63:0] > 63
dst[i+63:i] := SignBit
ELSE
dst[i+63:i] := SignExtend(a[i+63:i] >> count[63:0])
FI
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpsraq
__m128i _mm_maskz_sra_epi64 (__mmask8 k, __m128i a, __m128i count)
Synopsis
__m128i _mm_maskz_sra_epi64 (__mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsraq
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
IF count[63:0] > 63
dst[i+63:i] := SignBit
ELSE
dst[i+63:i] := SignExtend(a[i+63:i] >> count[63:0])
FI
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpsraq
__m128i _mm_sra_epi64 (__m128i a, __m128i count)
Synopsis
__m128i _mm_sra_epi64 (__m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsraq
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
IF count[63:0] > 63
dst[i+63:i] := SignBit
ELSE
dst[i+63:i] := SignExtend(a[i+63:i] >> count[63:0])
FI
ENDFOR
dst[MAX:128] := 0
vpsraq
__m256i _mm256_mask_sra_epi64 (__m256i src, __mmask8 k, __m256i a, __m128i count)
Synopsis
__m256i _mm256_mask_sra_epi64 (__m256i src, __mmask8 k, __m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpsraq
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
IF count[63:0] > 63
dst[i+63:i] := SignBit
ELSE
dst[i+63:i] := SignExtend(a[i+63:i] >> count[63:0])
FI
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpsraq
__m256i _mm256_maskz_sra_epi64 (__mmask8 k, __m256i a, __m128i count)
Synopsis
__m256i _mm256_maskz_sra_epi64 (__mmask8 k, __m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpsraq
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
IF count[63:0] > 63
dst[i+63:i] := SignBit
ELSE
dst[i+63:i] := SignExtend(a[i+63:i] >> count[63:0])
FI
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpsraq
__m256i _mm256_sra_epi64 (__m256i a, __m128i count)
Synopsis
__m256i _mm256_sra_epi64 (__m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpsraq
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
IF count[63:0] > 63
dst[i+63:i] := SignBit
ELSE
dst[i+63:i] := SignExtend(a[i+63:i] >> count[63:0])
FI
ENDFOR
dst[MAX:256] := 0
vpsraq
__m512i _mm512_mask_sra_epi64 (__m512i src, __mmask8 k, __m512i a, __m128i count)
Synopsis
__m512i _mm512_mask_sra_epi64 (__m512i src, __mmask8 k, __m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpsraq zmm {k}, zmm, xmm
CPUID Flags: AVX512F
Description
Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
IF count[63:0] > 63
dst[i+63:i] := SignBit
ELSE
dst[i+63:i] := SignExtend(a[i+63:i] >> count[63:0])
FI
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpsraq
__m512i _mm512_maskz_sra_epi64 (__mmask8 k, __m512i a, __m128i count)
Synopsis
__m512i _mm512_maskz_sra_epi64 (__mmask8 k, __m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpsraq zmm {k}, zmm, xmm
CPUID Flags: AVX512F
Description
Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
IF count[63:0] > 63
dst[i+63:i] := SignBit
ELSE
dst[i+63:i] := SignExtend(a[i+63:i] >> count[63:0])
FI
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpsraq
__m512i _mm512_sra_epi64 (__m512i a, __m128i count)
Synopsis
__m512i _mm512_sra_epi64 (__m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpsraq zmm {k}, zmm, xmm
CPUID Flags: AVX512F
Description
Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
IF count[63:0] > 63
dst[i+63:i] := SignBit
ELSE
dst[i+63:i] := SignExtend(a[i+63:i] >> count[63:0])
FI
ENDFOR
dst[MAX:512] := 0
vpsraw
__m128i _mm_mask_srai_epi16 (__m128i src, __mmask8 k, __m128i a, unsigned int imm8)
Synopsis
__m128i _mm_mask_srai_epi16 (__m128i src, __mmask8 k, __m128i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsraw
CPUID Flags: AVX512VL + AVX512BW
Description
Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
IF imm8[7:0] > 15
dst[i+15:i] := SignBit
ELSE
dst[i+15:i] := SignExtend(a[i+15:i] >> imm8[7:0])
FI
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:128] := 0
vpsraw
__m128i _mm_maskz_srai_epi16 (__mmask8 k, __m128i a, unsigned int imm8)
Synopsis
__m128i _mm_maskz_srai_epi16 (__mmask8 k, __m128i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsraw
CPUID Flags: AVX512VL + AVX512BW
Description
Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
IF imm8[7:0] > 15
dst[i+15:i] := SignBit
ELSE
dst[i+15:i] := SignExtend(a[i+15:i] >> imm8[7:0])
FI
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
psraw
__m128i _mm_srai_epi16 (__m128i a, int imm8)
Synopsis
__m128i _mm_srai_epi16 (__m128i a, int imm8)
#include "emmintrin.h"
Instruction: psraw xmm, imm
CPUID Flags: SSE2
Description
Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*16
IF imm8[7:0] > 15
dst[i+15:i] := SignBit
ELSE
dst[i+15:i] := SignExtend(a[i+15:i] >> imm8[7:0])
FI
ENDFOR
Performance
vpsraw
__m256i _mm256_mask_srai_epi16 (__m256i src, __mmask16 k, __m256i a, unsigned int imm8)
Synopsis
__m256i _mm256_mask_srai_epi16 (__m256i src, __mmask16 k, __m256i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsraw
CPUID Flags: AVX512VL + AVX512BW
Description
Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
IF imm8[7:0] > 15
dst[i+15:i] := SignBit
ELSE
dst[i+15:i] := SignExtend(a[i+15:i] >> imm8[7:0])
FI
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
vpsraw
__m256i _mm256_maskz_srai_epi16 (__mmask16 k, __m256i a, unsigned int imm8)
Synopsis
__m256i _mm256_maskz_srai_epi16 (__mmask16 k, __m256i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsraw
CPUID Flags: AVX512VL + AVX512BW
Description
Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
IF imm8[7:0] > 15
dst[i+15:i] := SignBit
ELSE
dst[i+15:i] := SignExtend(a[i+15:i] >> imm8[7:0])
FI
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpsraw
__m256i _mm256_srai_epi16 (__m256i a, int imm8)
Synopsis
__m256i _mm256_srai_epi16 (__m256i a, int imm8)
#include "immintrin.h"
Instruction: vpsraw ymm, ymm, imm
CPUID Flags: AVX2
Description
Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*16
IF imm8[7:0] > 15
dst[i+15:i] := SignBit
ELSE
dst[i+15:i] := SignExtend(a[i+15:i] >> imm8[7:0])
FI
ENDFOR
dst[MAX:256] := 0
Performance
vpsraw
__m512i _mm512_mask_srai_epi16 (__m512i src, __mmask32 k, __m512i a, unsigned int imm8)
Synopsis
__m512i _mm512_mask_srai_epi16 (__m512i src, __mmask32 k, __m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsraw
CPUID Flags: AVX512BW
Description
Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
IF imm8[7:0] > 15
dst[i+15:i] := SignBit
ELSE
dst[i+15:i] := SignExtend(a[i+15:i] >> imm8[7:0])
FI
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:512] := 0
vpsraw
__m512i _mm512_maskz_srai_epi16 (__mmask32 k, __m512i a, unsigned int imm8)
Synopsis
__m512i _mm512_maskz_srai_epi16 (__mmask32 k, __m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsraw
CPUID Flags: AVX512BW
Description
Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
IF imm8[7:0] > 15
dst[i+15:i] := SignBit
ELSE
dst[i+15:i] := SignExtend(a[i+15:i] >> imm8[7:0])
FI
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpsraw
__m512i _mm512_srai_epi16 (__m512i a, unsigned int imm8)
Synopsis
__m512i _mm512_srai_epi16 (__m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsraw
CPUID Flags: AVX512BW
Description
Shift packed 16-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst.
Operation
FOR j := 0 to 31
i := j*16
IF imm8[7:0] > 15
dst[i+15:i] := SignBit
ELSE
dst[i+15:i] := SignExtend(a[i+15:i] >> imm8[7:0])
FI
ENDFOR
dst[MAX:512] := 0
vpsrad
__m128i _mm_mask_srai_epi32 (__m128i src, __mmask8 k, __m128i a, unsigned int imm8)
Synopsis
__m128i _mm_mask_srai_epi32 (__m128i src, __mmask8 k, __m128i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsrad
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
IF imm8[7:0] > 31
dst[i+31:i] := SignBit
ELSE
dst[i+31:i] := SignExtend(a[i+31:i] >> imm8[7:0])
FI
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vpsrad
__m128i _mm_maskz_srai_epi32 (__mmask8 k, __m128i a, unsigned int imm8)
Synopsis
__m128i _mm_maskz_srai_epi32 (__mmask8 k, __m128i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsrad
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
IF imm8[7:0] > 31
dst[i+31:i] := SignBit
ELSE
dst[i+31:i] := SignExtend(a[i+31:i] >> imm8[7:0])
FI
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
psrad
__m128i _mm_srai_epi32 (__m128i a, int imm8)
Synopsis
__m128i _mm_srai_epi32 (__m128i a, int imm8)
#include "emmintrin.h"
Instruction: psrad xmm, imm
CPUID Flags: SSE2
Description
Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
IF imm8[7:0] > 31
dst[i+31:i] := SignBit
ELSE
dst[i+31:i] := SignExtend(a[i+31:i] >> imm8[7:0])
FI
ENDFOR
Performance
vpsrad
__m256i _mm256_mask_srai_epi32 (__m256i src, __mmask8 k, __m256i a, unsigned int imm8)
Synopsis
__m256i _mm256_mask_srai_epi32 (__m256i src, __mmask8 k, __m256i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsrad
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
IF imm8[7:0] > 31
dst[i+31:i] := SignBit
ELSE
dst[i+31:i] := SignExtend(a[i+31:i] >> imm8[7:0])
FI
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vpsrad
__m256i _mm256_maskz_srai_epi32 (__mmask8 k, __m256i a, unsigned int imm8)
Synopsis
__m256i _mm256_maskz_srai_epi32 (__mmask8 k, __m256i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsrad
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
IF imm8[7:0] > 31
dst[i+31:i] := SignBit
ELSE
dst[i+31:i] := SignExtend(a[i+31:i] >> imm8[7:0])
FI
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpsrad
__m256i _mm256_srai_epi32 (__m256i a, int imm8)
Synopsis
__m256i _mm256_srai_epi32 (__m256i a, int imm8)
#include "immintrin.h"
Instruction: vpsrad ymm, ymm, imm
CPUID Flags: AVX2
Description
Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
IF imm8[7:0] > 31
dst[i+31:i] := SignBit
ELSE
dst[i+31:i] := SignExtend(a[i+31:i] >> imm8[7:0])
FI
ENDFOR
dst[MAX:256] := 0
Performance
vpsrad
__m512i _mm512_mask_srai_epi32 (__m512i src, __mmask16 k, __m512i a, unsigned int imm8)
Synopsis
__m512i _mm512_mask_srai_epi32 (__m512i src, __mmask16 k, __m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsrad zmm {k}, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
IF imm8[7:0] > 31
dst[i+31:i] := SignBit
ELSE
dst[i+31:i] := SignExtend(a[i+31:i] >> imm8[7:0])
FI
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpsrad
__m512i _mm512_maskz_srai_epi32 (__mmask16 k, __m512i a, unsigned int imm8)
Synopsis
__m512i _mm512_maskz_srai_epi32 (__mmask16 k, __m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsrad zmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
IF imm8[7:0] > 31
dst[i+31:i] := SignBit
ELSE
dst[i+31:i] := SignExtend(a[i+31:i] >> imm8[7:0])
FI
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpsrad
__m512i _mm512_srai_epi32 (__m512i a, unsigned int imm8)
Synopsis
__m512i _mm512_srai_epi32 (__m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsrad zmm {k}, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
IF imm8[7:0] > 31
dst[i+31:i] := SignBit
ELSE
dst[i+31:i] := SignExtend(a[i+31:i] >> imm8[7:0])
FI
ENDFOR
dst[MAX:512] := 0
vpsraq
__m128i _mm_mask_srai_epi64 (__m128i src, __mmask8 k, __m128i a, unsigned int imm8)
Synopsis
__m128i _mm_mask_srai_epi64 (__m128i src, __mmask8 k, __m128i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsraq
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
IF imm8[7:0] > 63
dst[i+63:i] := SignBit
ELSE
dst[i+63:i] := SignExtend(a[i+63:i] >> imm8[7:0])
FI
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpsraq
__m128i _mm_maskz_srai_epi64 (__mmask8 k, __m128i a, unsigned int imm8)
Synopsis
__m128i _mm_maskz_srai_epi64 (__mmask8 k, __m128i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsraq
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
IF imm8[7:0] > 63
dst[i+63:i] := SignBit
ELSE
dst[i+63:i] := SignExtend(a[i+63:i] >> imm8[7:0])
FI
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpsraq
__m128i _mm_srai_epi64 (__m128i a, unsigned int imm8)
Synopsis
__m128i _mm_srai_epi64 (__m128i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsraq
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
IF imm8[7:0] > 63
dst[i+63:i] := SignBit
ELSE
dst[i+63:i] := SignExtend(a[i+63:i] >> imm8[7:0])
FI
ENDFOR
dst[MAX:128] := 0
vpsraq
__m256i _mm256_mask_srai_epi64 (__m256i src, __mmask8 k, __m256i a, unsigned int imm8)
Synopsis
__m256i _mm256_mask_srai_epi64 (__m256i src, __mmask8 k, __m256i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsraq
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
IF imm8[7:0] > 63
dst[i+63:i] := SignBit
ELSE
dst[i+63:i] := SignExtend(a[i+63:i] >> imm8[7:0])
FI
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpsraq
__m256i _mm256_maskz_srai_epi64 (__mmask8 k, __m256i a, unsigned int imm8)
Synopsis
__m256i _mm256_maskz_srai_epi64 (__mmask8 k, __m256i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsraq
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
IF imm8[7:0] > 63
dst[i+63:i] := SignBit
ELSE
dst[i+63:i] := SignExtend(a[i+63:i] >> imm8[7:0])
FI
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpsraq
__m256i _mm256_srai_epi64 (__m256i a, unsigned int imm8)
Synopsis
__m256i _mm256_srai_epi64 (__m256i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsraq
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
IF imm8[7:0] > 63
dst[i+63:i] := SignBit
ELSE
dst[i+63:i] := SignExtend(a[i+63:i] >> imm8[7:0])
FI
ENDFOR
dst[MAX:256] := 0
vpsraq
__m512i _mm512_mask_srai_epi64 (__m512i src, __mmask8 k, __m512i a, unsigned int imm8)
Synopsis
__m512i _mm512_mask_srai_epi64 (__m512i src, __mmask8 k, __m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsraq zmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
IF imm8[7:0] > 63
dst[i+63:i] := SignBit
ELSE
dst[i+63:i] := SignExtend(a[i+63:i] >> imm8[7:0])
FI
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpsraq
__m512i _mm512_maskz_srai_epi64 (__mmask8 k, __m512i a, unsigned int imm8)
Synopsis
__m512i _mm512_maskz_srai_epi64 (__mmask8 k, __m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsraq zmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
IF imm8[7:0] > 63
dst[i+63:i] := SignBit
ELSE
dst[i+63:i] := SignExtend(a[i+63:i] >> imm8[7:0])
FI
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpsraq
__m512i _mm512_srai_epi64 (__m512i a, unsigned int imm8)
Synopsis
__m512i _mm512_srai_epi64 (__m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsraq zmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Shift packed 64-bit integers in a right by imm8 while shifting in sign bits, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
IF imm8[7:0] > 63
dst[i+63:i] := SignBit
ELSE
dst[i+63:i] := SignExtend(a[i+63:i] >> imm8[7:0])
FI
ENDFOR
dst[MAX:512] := 0
vpsravw
__m128i _mm_mask_srav_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i count)
Synopsis
__m128i _mm_mask_srav_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsravw
CPUID Flags: AVX512VL + AVX512BW
Description
Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := SignExtend(a[i+15:i] >> count[i+15:i])
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:128] := 0
vpsravw
__m128i _mm_maskz_srav_epi16 (__mmask8 k, __m128i a, __m128i count)
Synopsis
__m128i _mm_maskz_srav_epi16 (__mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsravw
CPUID Flags: AVX512VL + AVX512BW
Description
Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := SignExtend(a[i+15:i] >> count[i+15:i])
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpsravw
__m128i _mm_srav_epi16 (__m128i a, __m128i count)
Synopsis
__m128i _mm_srav_epi16 (__m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsravw
CPUID Flags: AVX512VL + AVX512BW
Description
Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*16
dst[i+15:i] := SignExtend(a[i+15:i] >> count[i+15:i])
ENDFOR
dst[MAX:128] := 0
vpsravw
__m256i _mm256_mask_srav_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i count)
Synopsis
__m256i _mm256_mask_srav_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsravw
CPUID Flags: AVX512VL + AVX512BW
Description
Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := SignExtend(a[i+15:i] >> count[i+15:i])
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
vpsravw
__m256i _mm256_maskz_srav_epi16 (__mmask16 k, __m256i a, __m256i count)
Synopsis
__m256i _mm256_maskz_srav_epi16 (__mmask16 k, __m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsravw
CPUID Flags: AVX512VL + AVX512BW
Description
Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := SignExtend(a[i+15:i] >> count[i+15:i])
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpsravw
__m256i _mm256_srav_epi16 (__m256i a, __m256i count)
Synopsis
__m256i _mm256_srav_epi16 (__m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsravw
CPUID Flags: AVX512VL + AVX512BW
Description
Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*16
dst[i+15:i] := SignExtend(a[i+15:i] >> count[i+15:i])
ENDFOR
dst[MAX:256] := 0
vpsravw
__m512i _mm512_mask_srav_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i count)
Synopsis
__m512i _mm512_mask_srav_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsravw
CPUID Flags: AVX512BW
Description
Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := SignExtend(a[i+15:i] >> count[i+15:i])
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:512] := 0
vpsravw
__m512i _mm512_maskz_srav_epi16 (__mmask32 k, __m512i a, __m512i count)
Synopsis
__m512i _mm512_maskz_srav_epi16 (__mmask32 k, __m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsravw
CPUID Flags: AVX512BW
Description
Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := SignExtend(a[i+15:i] >> count[i+15:i])
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpsravw
__m512i _mm512_srav_epi16 (__m512i a, __m512i count)
Synopsis
__m512i _mm512_srav_epi16 (__m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsravw
CPUID Flags: AVX512BW
Description
Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst.
Operation
FOR j := 0 to 31
i := j*16
dst[i+15:i] := SignExtend(a[i+15:i] >> count[i+15:i])
ENDFOR
dst[MAX:512] := 0
vpsravd
__m128i _mm_mask_srav_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i count)
Synopsis
__m128i _mm_mask_srav_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsravd
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := SignExtend(a[i+31:i] >> count[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vpsravd
__m128i _mm_maskz_srav_epi32 (__mmask8 k, __m128i a, __m128i count)
Synopsis
__m128i _mm_maskz_srav_epi32 (__mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsravd
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := SignExtend(a[i+31:i] >> count[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpsravd
__m128i _mm_srav_epi32 (__m128i a, __m128i count)
Synopsis
__m128i _mm_srav_epi32 (__m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsravd xmm, xmm, xmm
CPUID Flags: AVX2
Description
Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := SignExtend(a[i+31:i] >> count[i+31:i])
ENDFOR
dst[MAX:128] := 0
Performance
vpsravd
__m256i _mm256_mask_srav_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i count)
Synopsis
__m256i _mm256_mask_srav_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsravd
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := SignExtend(a[i+31:i] >> count[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vpsravd
__m256i _mm256_maskz_srav_epi32 (__mmask8 k, __m256i a, __m256i count)
Synopsis
__m256i _mm256_maskz_srav_epi32 (__mmask8 k, __m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsravd
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := SignExtend(a[i+31:i] >> count[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpsravd
__m256i _mm256_srav_epi32 (__m256i a, __m256i count)
Synopsis
__m256i _mm256_srav_epi32 (__m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsravd ymm, ymm, ymm
CPUID Flags: AVX2
Description
Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := SignExtend(a[i+31:i] >> count[i+31:i])
ENDFOR
dst[MAX:256] := 0
Performance
vpsravd
__m512i _mm512_mask_srav_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i count)
Synopsis
__m512i _mm512_mask_srav_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsravd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := SignExtend(a[i+31:i] >> count[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpsravd
__m512i _mm512_maskz_srav_epi32 (__mmask16 k, __m512i a, __m512i count)
Synopsis
__m512i _mm512_maskz_srav_epi32 (__mmask16 k, __m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsravd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := SignExtend(a[i+31:i] >> count[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpsravd
__m512i _mm512_srav_epi32 (__m512i a, __m512i count)
Synopsis
__m512i _mm512_srav_epi32 (__m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsravd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := SignExtend(a[i+31:i] >> count[i+31:i])
ENDFOR
dst[MAX:512] := 0
vpsravq
__m128i _mm_mask_srav_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i count)
Synopsis
__m128i _mm_mask_srav_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsravq
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := SignExtend(a[i+63:i] >> count[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpsravq
__m128i _mm_maskz_srav_epi64 (__mmask8 k, __m128i a, __m128i count)
Synopsis
__m128i _mm_maskz_srav_epi64 (__mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsravq
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := SignExtend(a[i+63:i] >> count[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpsravq
__m128i _mm_srav_epi64 (__m128i a, __m128i count)
Synopsis
__m128i _mm_srav_epi64 (__m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsravq
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := SignExtend(a[i+63:i] >> count[i+63:i])
ENDFOR
dst[MAX:128] := 0
vpsravq
__m256i _mm256_mask_srav_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i count)
Synopsis
__m256i _mm256_mask_srav_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsravq
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := SignExtend(a[i+63:i] >> count[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpsravq
__m256i _mm256_maskz_srav_epi64 (__mmask8 k, __m256i a, __m256i count)
Synopsis
__m256i _mm256_maskz_srav_epi64 (__mmask8 k, __m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsravq
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := SignExtend(a[i+63:i] >> count[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpsravq
__m256i _mm256_srav_epi64 (__m256i a, __m256i count)
Synopsis
__m256i _mm256_srav_epi64 (__m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsravq
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := SignExtend(a[i+63:i] >> count[i+63:i])
ENDFOR
dst[MAX:256] := 0
vpsravq
__m512i _mm512_mask_srav_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i count)
Synopsis
__m512i _mm512_mask_srav_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsravq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := SignExtend(a[i+63:i] >> count[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpsravq
__m512i _mm512_maskz_srav_epi64 (__mmask8 k, __m512i a, __m512i count)
Synopsis
__m512i _mm512_maskz_srav_epi64 (__mmask8 k, __m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsravq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := SignExtend(a[i+63:i] >> count[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpsravq
__m512i _mm512_srav_epi64 (__m512i a, __m512i count)
Synopsis
__m512i _mm512_srav_epi64 (__m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsravq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := SignExtend(a[i+63:i] >> count[i+63:i])
ENDFOR
dst[MAX:512] := 0
vpsrlw
__m128i _mm_mask_srl_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i count)
Synopsis
__m128i _mm_mask_srl_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrlw
CPUID Flags: AVX512VL + AVX512BW
Description
Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
IF count[63:0] > 15
dst[i+15:i] := 0
ELSE
dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[63:0])
FI
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:128] := 0
vpsrlw
__m128i _mm_maskz_srl_epi16 (__mmask8 k, __m128i a, __m128i count)
Synopsis
__m128i _mm_maskz_srl_epi16 (__mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrlw
CPUID Flags: AVX512VL + AVX512BW
Description
Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
IF count[63:0] > 15
dst[i+15:i] := 0
ELSE
dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[63:0])
FI
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
psrlw
__m128i _mm_srl_epi16 (__m128i a, __m128i count)
Synopsis
__m128i _mm_srl_epi16 (__m128i a, __m128i count)
#include "emmintrin.h"
Instruction: psrlw xmm, xmm
CPUID Flags: SSE2
Description
Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*16
IF count[63:0] > 15
dst[i+15:i] := 0
ELSE
dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[63:0])
FI
ENDFOR
Performance
vpsrlw
__m256i _mm256_mask_srl_epi16 (__m256i src, __mmask16 k, __m256i a, __m128i count)
Synopsis
__m256i _mm256_mask_srl_epi16 (__m256i src, __mmask16 k, __m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrlw
CPUID Flags: AVX512VL + AVX512BW
Description
Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
IF count[63:0] > 15
dst[i+15:i] := 0
ELSE
dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[63:0])
FI
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
vpsrlw
__m256i _mm256_maskz_srl_epi16 (__mmask16 k, __m256i a, __m128i count)
Synopsis
__m256i _mm256_maskz_srl_epi16 (__mmask16 k, __m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrlw
CPUID Flags: AVX512VL + AVX512BW
Description
Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
IF count[63:0] > 15
dst[i+15:i] := 0
ELSE
dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[63:0])
FI
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpsrlw
__m256i _mm256_srl_epi16 (__m256i a, __m128i count)
Synopsis
__m256i _mm256_srl_epi16 (__m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrlw ymm, ymm, xmm
CPUID Flags: AVX2
Description
Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*16
IF count[63:0] > 15
dst[i+15:i] := 0
ELSE
dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[63:0])
FI
ENDFOR
dst[MAX:256] := 0
Performance
vpsrlw
__m512i _mm512_mask_srl_epi16 (__m512i src, __mmask32 k, __m512i a, __m128i count)
Synopsis
__m512i _mm512_mask_srl_epi16 (__m512i src, __mmask32 k, __m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrlw
CPUID Flags: AVX512BW
Description
Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
IF count[63:0] > 15
dst[i+15:i] := 0
ELSE
dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[63:0])
FI
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:512] := 0
vpsrlw
__m512i _mm512_maskz_srl_epi16 (__mmask32 k, __m512i a, __m128i count)
Synopsis
__m512i _mm512_maskz_srl_epi16 (__mmask32 k, __m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrlw
CPUID Flags: AVX512BW
Description
Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
IF count[63:0] > 15
dst[i+15:i] := 0
ELSE
dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[63:0])
FI
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpsrlw
__m512i _mm512_srl_epi16 (__m512i a, __m128i count)
Synopsis
__m512i _mm512_srl_epi16 (__m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrlw
CPUID Flags: AVX512BW
Description
Shift packed 16-bit integers in a right by count while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 31
i := j*16
IF count[63:0] > 15
dst[i+15:i] := 0
ELSE
dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[63:0])
FI
ENDFOR
dst[MAX:512] := 0
vpsrld
__m128i _mm_mask_srl_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i count)
Synopsis
__m128i _mm_mask_srl_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrld
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
IF count[63:0] > 31
dst[i+31:i] := 0
ELSE
dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[63:0])
FI
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vpsrld
__m128i _mm_maskz_srl_epi32 (__mmask8 k, __m128i a, __m128i count)
Synopsis
__m128i _mm_maskz_srl_epi32 (__mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrld
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
IF count[63:0] > 31
dst[i+31:i] := 0
ELSE
dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[63:0])
FI
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
psrld
__m128i _mm_srl_epi32 (__m128i a, __m128i count)
Synopsis
__m128i _mm_srl_epi32 (__m128i a, __m128i count)
#include "emmintrin.h"
Instruction: psrld xmm, xmm
CPUID Flags: SSE2
Description
Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
IF count[63:0] > 31
dst[i+31:i] := 0
ELSE
dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[63:0])
FI
ENDFOR
Performance
vpsrld
__m256i _mm256_mask_srl_epi32 (__m256i src, __mmask8 k, __m256i a, __m128i count)
Synopsis
__m256i _mm256_mask_srl_epi32 (__m256i src, __mmask8 k, __m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrld
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
IF count[63:0] > 31
dst[i+31:i] := 0
ELSE
dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[63:0])
FI
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vpsrld
__m256i _mm256_maskz_srl_epi32 (__mmask8 k, __m256i a, __m128i count)
Synopsis
__m256i _mm256_maskz_srl_epi32 (__mmask8 k, __m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrld
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
IF count[63:0] > 31
dst[i+31:i] := 0
ELSE
dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[63:0])
FI
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpsrld
__m256i _mm256_srl_epi32 (__m256i a, __m128i count)
Synopsis
__m256i _mm256_srl_epi32 (__m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrld ymm, ymm, xmm
CPUID Flags: AVX2
Description
Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
IF count[63:0] > 31
dst[i+31:i] := 0
ELSE
dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[63:0])
FI
ENDFOR
dst[MAX:256] := 0
Performance
vpsrld
__m512i _mm512_mask_srl_epi32 (__m512i src, __mmask16 k, __m512i a, __m128i count)
Synopsis
__m512i _mm512_mask_srl_epi32 (__m512i src, __mmask16 k, __m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrld zmm {k}, zmm, xmm
CPUID Flags: AVX512F
Description
Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
IF count[63:0] > 31
dst[i+31:i] := 0
ELSE
dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[63:0])
FI
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpsrld
__m512i _mm512_maskz_srl_epi32 (__mmask16 k, __m512i a, __m128i count)
Synopsis
__m512i _mm512_maskz_srl_epi32 (__mmask16 k, __m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrld zmm {k}, zmm, xmm
CPUID Flags: AVX512F
Description
Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
IF count[63:0] > 31
dst[i+31:i] := 0
ELSE
dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[63:0])
FI
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpsrld
__m512i _mm512_srl_epi32 (__m512i a, __m128i count)
Synopsis
__m512i _mm512_srl_epi32 (__m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrld zmm {k}, zmm, xmm
CPUID Flags: AVX512F
Description
Shift packed 32-bit integers in a right by count while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
IF count[63:0] > 31
dst[i+31:i] := 0
ELSE
dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[63:0])
FI
ENDFOR
dst[MAX:512] := 0
vpsrlq
__m128i _mm_mask_srl_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i count)
Synopsis
__m128i _mm_mask_srl_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrlq
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
IF count[63:0] > 63
dst[i+63:i] := 0
ELSE
dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[63:0])
FI
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpsrlq
__m128i _mm_maskz_srl_epi64 (__mmask8 k, __m128i a, __m128i count)
Synopsis
__m128i _mm_maskz_srl_epi64 (__mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrlq
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
IF count[63:0] > 63
dst[i+63:i] := 0
ELSE
dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[63:0])
FI
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
psrlq
__m128i _mm_srl_epi64 (__m128i a, __m128i count)
Synopsis
__m128i _mm_srl_epi64 (__m128i a, __m128i count)
#include "emmintrin.h"
Instruction: psrlq xmm, xmm
CPUID Flags: SSE2
Description
Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
IF count[63:0] > 63
dst[i+63:i] := 0
ELSE
dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[63:0])
FI
ENDFOR
Performance
vpsrlq
__m256i _mm256_mask_srl_epi64 (__m256i src, __mmask8 k, __m256i a, __m128i count)
Synopsis
__m256i _mm256_mask_srl_epi64 (__m256i src, __mmask8 k, __m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrlq
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
IF count[63:0] > 63
dst[i+63:i] := 0
ELSE
dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[63:0])
FI
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpsrlq
__m256i _mm256_maskz_srl_epi64 (__mmask8 k, __m256i a, __m128i count)
Synopsis
__m256i _mm256_maskz_srl_epi64 (__mmask8 k, __m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrlq
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
IF count[63:0] > 63
dst[i+63:i] := 0
ELSE
dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[63:0])
FI
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpsrlq
__m256i _mm256_srl_epi64 (__m256i a, __m128i count)
Synopsis
__m256i _mm256_srl_epi64 (__m256i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrlq ymm, ymm, xmm
CPUID Flags: AVX2
Description
Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
IF count[63:0] > 63
dst[i+63:i] := 0
ELSE
dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[63:0])
FI
ENDFOR
dst[MAX:256] := 0
Performance
vpsrlq
__m512i _mm512_mask_srl_epi64 (__m512i src, __mmask8 k, __m512i a, __m128i count)
Synopsis
__m512i _mm512_mask_srl_epi64 (__m512i src, __mmask8 k, __m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrlq zmm {k}, zmm, xmm
CPUID Flags: AVX512F
Description
Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
IF count[63:0] > 63
dst[i+63:i] := 0
ELSE
dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[63:0])
FI
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpsrlq
__m512i _mm512_maskz_srl_epi64 (__mmask8 k, __m512i a, __m128i count)
Synopsis
__m512i _mm512_maskz_srl_epi64 (__mmask8 k, __m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrlq zmm {k}, zmm, xmm
CPUID Flags: AVX512F
Description
Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
IF count[63:0] > 63
dst[i+63:i] := 0
ELSE
dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[63:0])
FI
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpsrlq
__m512i _mm512_srl_epi64 (__m512i a, __m128i count)
Synopsis
__m512i _mm512_srl_epi64 (__m512i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrlq zmm {k}, zmm, xmm
CPUID Flags: AVX512F
Description
Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
IF count[63:0] > 63
dst[i+63:i] := 0
ELSE
dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[63:0])
FI
ENDFOR
dst[MAX:512] := 0
vpsrlw
__m128i _mm_mask_srli_epi16 (__m128i src, __mmask8 k, __m128i a, int imm8)
Synopsis
__m128i _mm_mask_srli_epi16 (__m128i src, __mmask8 k, __m128i a, int imm8)
#include "immintrin.h"
Instruction: vpsrlw
CPUID Flags: AVX512VL + AVX512BW
Description
Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
IF imm8[7:0] > 15
dst[i+15:i] := 0
ELSE
dst[i+15:i] := ZeroExtend(a[i+15:i] >> imm8[7:0])
FI
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:128] := 0
vpsrlw
__m128i _mm_maskz_srli_epi16 (__mmask8 k, __m128i a, int imm8)
Synopsis
__m128i _mm_maskz_srli_epi16 (__mmask8 k, __m128i a, int imm8)
#include "immintrin.h"
Instruction: vpsrlw
CPUID Flags: AVX512VL + AVX512BW
Description
Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
IF imm8[7:0] > 15
dst[i+15:i] := 0
ELSE
dst[i+15:i] := ZeroExtend(a[i+15:i] >> imm8[7:0])
FI
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
psrlw
__m128i _mm_srli_epi16 (__m128i a, int imm8)
Synopsis
__m128i _mm_srli_epi16 (__m128i a, int imm8)
#include "emmintrin.h"
Instruction: psrlw xmm, imm
CPUID Flags: SSE2
Description
Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*16
IF imm8[7:0] > 15
dst[i+15:i] := 0
ELSE
dst[i+15:i] := ZeroExtend(a[i+15:i] >> imm8[7:0])
FI
ENDFOR
Performance
vpsrlw
__m256i _mm256_mask_srli_epi16 (__m256i src, __mmask16 k, __m256i a, int imm8)
Synopsis
__m256i _mm256_mask_srli_epi16 (__m256i src, __mmask16 k, __m256i a, int imm8)
#include "immintrin.h"
Instruction: vpsrlw
CPUID Flags: AVX512VL + AVX512BW
Description
Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
IF imm8[7:0] > 15
dst[i+15:i] := 0
ELSE
dst[i+15:i] := ZeroExtend(a[i+15:i] >> imm8[7:0])
FI
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
vpsrlw
__m256i _mm256_maskz_srli_epi16 (__mmask16 k, __m256i a, int imm8)
Synopsis
__m256i _mm256_maskz_srli_epi16 (__mmask16 k, __m256i a, int imm8)
#include "immintrin.h"
Instruction: vpsrlw
CPUID Flags: AVX512VL + AVX512BW
Description
Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
IF imm8[7:0] > 15
dst[i+15:i] := 0
ELSE
dst[i+15:i] := ZeroExtend(a[i+15:i] >> imm8[7:0])
FI
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpsrlw
__m256i _mm256_srli_epi16 (__m256i a, int imm8)
Synopsis
__m256i _mm256_srli_epi16 (__m256i a, int imm8)
#include "immintrin.h"
Instruction: vpsrlw ymm, ymm, imm
CPUID Flags: AVX2
Description
Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*16
IF imm8[7:0] > 15
dst[i+15:i] := 0
ELSE
dst[i+15:i] := ZeroExtend(a[i+15:i] >> imm8[7:0])
FI
ENDFOR
dst[MAX:256] := 0
Performance
vpsrlw
__m512i _mm512_mask_srli_epi16 (__m512i src, __mmask32 k, __m512i a, unsigned int imm8)
Synopsis
__m512i _mm512_mask_srli_epi16 (__m512i src, __mmask32 k, __m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsrlw
CPUID Flags: AVX512BW
Description
Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
IF imm8[7:0] > 15
dst[i+15:i] := 0
ELSE
dst[i+15:i] := ZeroExtend(a[i+15:i] >> imm8[7:0])
FI
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:512] := 0
vpsrlw
__m512i _mm512_maskz_srli_epi16 (__mmask32 k, __m512i a, int imm8)
Synopsis
__m512i _mm512_maskz_srli_epi16 (__mmask32 k, __m512i a, int imm8)
#include "immintrin.h"
Instruction: vpsrlw
CPUID Flags: AVX512BW
Description
Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
IF imm8[7:0] > 15
dst[i+15:i] := 0
ELSE
dst[i+15:i] := ZeroExtend(a[i+15:i] >> imm8[7:0])
FI
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpsrlw
__m512i _mm512_srli_epi16 (__m512i a, unsigned int imm8)
Synopsis
__m512i _mm512_srli_epi16 (__m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsrlw
CPUID Flags: AVX512BW
Description
Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 31
i := j*16
IF imm8[7:0] > 15
dst[i+15:i] := 0
ELSE
dst[i+15:i] := ZeroExtend(a[i+15:i] >> imm8[7:0])
FI
ENDFOR
dst[MAX:512] := 0
vpsrld
__m128i _mm_mask_srli_epi32 (__m128i src, __mmask8 k, __m128i a, unsigned int imm8)
Synopsis
__m128i _mm_mask_srli_epi32 (__m128i src, __mmask8 k, __m128i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsrld
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
IF imm8[7:0] > 31
dst[i+31:i] := 0
ELSE
dst[i+31:i] := ZeroExtend(a[i+31:i] >> imm8[7:0])
FI
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vpsrld
__m128i _mm_maskz_srli_epi32 (__mmask8 k, __m128i a, unsigned int imm8)
Synopsis
__m128i _mm_maskz_srli_epi32 (__mmask8 k, __m128i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsrld
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
IF imm8[7:0] > 31
dst[i+31:i] := 0
ELSE
dst[i+31:i] := ZeroExtend(a[i+31:i] >> imm8[7:0])
FI
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
psrld
__m128i _mm_srli_epi32 (__m128i a, int imm8)
Synopsis
__m128i _mm_srli_epi32 (__m128i a, int imm8)
#include "emmintrin.h"
Instruction: psrld xmm, imm
CPUID Flags: SSE2
Description
Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
IF imm8[7:0] > 31
dst[i+31:i] := 0
ELSE
dst[i+31:i] := ZeroExtend(a[i+31:i] >> imm8[7:0])
FI
ENDFOR
Performance
vpsrld
__m256i _mm256_mask_srli_epi32 (__m256i src, __mmask8 k, __m256i a, unsigned int imm8)
Synopsis
__m256i _mm256_mask_srli_epi32 (__m256i src, __mmask8 k, __m256i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsrld
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
IF imm8[7:0] > 31
dst[i+31:i] := 0
ELSE
dst[i+31:i] := ZeroExtend(a[i+31:i] >> imm8[7:0])
FI
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vpsrld
__m256i _mm256_maskz_srli_epi32 (__mmask8 k, __m256i a, unsigned int imm8)
Synopsis
__m256i _mm256_maskz_srli_epi32 (__mmask8 k, __m256i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsrld
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
IF imm8[7:0] > 31
dst[i+31:i] := 0
ELSE
dst[i+31:i] := ZeroExtend(a[i+31:i] >> imm8[7:0])
FI
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpsrld
__m256i _mm256_srli_epi32 (__m256i a, int imm8)
Synopsis
__m256i _mm256_srli_epi32 (__m256i a, int imm8)
#include "immintrin.h"
Instruction: vpsrld ymm, ymm, imm
CPUID Flags: AVX2
Description
Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
IF imm8[7:0] > 31
dst[i+31:i] := 0
ELSE
dst[i+31:i] := ZeroExtend(a[i+31:i] >> imm8[7:0])
FI
ENDFOR
dst[MAX:256] := 0
Performance
vpsrld
__m512i _mm512_mask_srli_epi32 (__m512i src, __mmask16 k, __m512i a, unsigned int imm8)
Synopsis
__m512i _mm512_mask_srli_epi32 (__m512i src, __mmask16 k, __m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsrld zmm {k}, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
IF imm8[7:0] > 31
dst[i+31:i] := 0
ELSE
dst[i+31:i] := ZeroExtend(a[i+31:i] >> imm8[7:0])
FI
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpsrld
__m512i _mm512_maskz_srli_epi32 (__mmask16 k, __m512i a, unsigned int imm8)
Synopsis
__m512i _mm512_maskz_srli_epi32 (__mmask16 k, __m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsrld zmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
IF imm8[7:0] > 31
dst[i+31:i] := 0
ELSE
dst[i+31:i] := ZeroExtend(a[i+31:i] >> imm8[7:0])
FI
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpsrld
__m512i _mm512_srli_epi32 (__m512i a, unsigned int imm8)
Synopsis
__m512i _mm512_srli_epi32 (__m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsrld zmm {k}, zmm, imm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
IF imm8[7:0] > 31
dst[i+31:i] := 0
ELSE
dst[i+31:i] := ZeroExtend(a[i+31:i] >> imm8[7:0])
FI
ENDFOR
dst[MAX:512] := 0
vpsrlq
__m128i _mm_mask_srli_epi64 (__m128i src, __mmask8 k, __m128i a, unsigned int imm8)
Synopsis
__m128i _mm_mask_srli_epi64 (__m128i src, __mmask8 k, __m128i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsrlq
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
IF imm8[7:0] > 63
dst[i+63:i] := 0
ELSE
dst[i+63:i] := ZeroExtend(a[i+63:i] >> imm8[7:0])
FI
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpsrlq
__m128i _mm_maskz_srli_epi64 (__mmask8 k, __m128i a, unsigned int imm8)
Synopsis
__m128i _mm_maskz_srli_epi64 (__mmask8 k, __m128i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsrlq
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
IF imm8[7:0] > 63
dst[i+63:i] := 0
ELSE
dst[i+63:i] := ZeroExtend(a[i+63:i] >> imm8[7:0])
FI
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
psrlq
__m128i _mm_srli_epi64 (__m128i a, int imm8)
Synopsis
__m128i _mm_srli_epi64 (__m128i a, int imm8)
#include "emmintrin.h"
Instruction: psrlq xmm, imm
CPUID Flags: SSE2
Description
Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
IF imm8[7:0] > 63
dst[i+63:i] := 0
ELSE
dst[i+63:i] := ZeroExtend(a[i+63:i] >> imm8[7:0])
FI
ENDFOR
Performance
vpsrlq
__m256i _mm256_mask_srli_epi64 (__m256i src, __mmask8 k, __m256i a, unsigned int imm8)
Synopsis
__m256i _mm256_mask_srli_epi64 (__m256i src, __mmask8 k, __m256i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsrlq
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
IF imm8[7:0] > 63
dst[i+63:i] := 0
ELSE
dst[i+63:i] := ZeroExtend(a[i+63:i] >> imm8[7:0])
FI
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpsrlq
__m256i _mm256_maskz_srli_epi64 (__mmask8 k, __m256i a, unsigned int imm8)
Synopsis
__m256i _mm256_maskz_srli_epi64 (__mmask8 k, __m256i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsrlq
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
IF imm8[7:0] > 63
dst[i+63:i] := 0
ELSE
dst[i+63:i] := ZeroExtend(a[i+63:i] >> imm8[7:0])
FI
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpsrlq
__m256i _mm256_srli_epi64 (__m256i a, int imm8)
Synopsis
__m256i _mm256_srli_epi64 (__m256i a, int imm8)
#include "immintrin.h"
Instruction: vpsrlq ymm, ymm, imm
CPUID Flags: AVX2
Description
Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
IF imm8[7:0] > 63
dst[i+63:i] := 0
ELSE
dst[i+63:i] := ZeroExtend(a[i+63:i] >> imm8[7:0])
FI
ENDFOR
dst[MAX:256] := 0
Performance
vpsrlq
__m512i _mm512_mask_srli_epi64 (__m512i src, __mmask8 k, __m512i a, unsigned int imm8)
Synopsis
__m512i _mm512_mask_srli_epi64 (__m512i src, __mmask8 k, __m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsrlq zmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
IF imm8[7:0] > 63
dst[i+63:i] := 0
ELSE
dst[i+63:i] := ZeroExtend(a[i+63:i] >> imm8[7:0])
FI
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpsrlq
__m512i _mm512_maskz_srli_epi64 (__mmask8 k, __m512i a, unsigned int imm8)
Synopsis
__m512i _mm512_maskz_srli_epi64 (__mmask8 k, __m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsrlq zmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
IF imm8[7:0] > 63
dst[i+63:i] := 0
ELSE
dst[i+63:i] := ZeroExtend(a[i+63:i] >> imm8[7:0])
FI
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpsrlq
__m512i _mm512_srli_epi64 (__m512i a, unsigned int imm8)
Synopsis
__m512i _mm512_srli_epi64 (__m512i a, unsigned int imm8)
#include "immintrin.h"
Instruction: vpsrlq zmm {k}, zmm, imm
CPUID Flags: AVX512F
Description
Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
IF imm8[7:0] > 63
dst[i+63:i] := 0
ELSE
dst[i+63:i] := ZeroExtend(a[i+63:i] >> imm8[7:0])
FI
ENDFOR
dst[MAX:512] := 0
psrldq
__m128i _mm_srli_si128 (__m128i a, int imm8)
Synopsis
__m128i _mm_srli_si128 (__m128i a, int imm8)
#include "emmintrin.h"
Instruction: psrldq xmm, imm
CPUID Flags: SSE2
Description
Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
Operation
tmp := imm8[7:0]
IF tmp > 15
tmp := 16
FI
dst[127:0] := a[127:0] >> (tmp*8)
Performance
vpsrldq
__m256i _mm256_srli_si256 (__m256i a, const int imm8)
Synopsis
__m256i _mm256_srli_si256 (__m256i a, const int imm8)
#include "immintrin.h"
Instruction: vpsrldq ymm, ymm, imm
CPUID Flags: AVX2
Description
Shift 128-bit lanes in a right by imm8 bytes while shifting in zeros, and store the results in dst.
Operation
tmp := imm8[7:0]
IF tmp > 15
tmp := 16
FI
dst[127:0] := a[127:0] >> (tmp*8)
dst[255:128] := a[255:128] >> (tmp*8)
dst[MAX:256] := 0
Performance
vpsrlvw
__m128i _mm_mask_srlv_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i count)
Synopsis
__m128i _mm_mask_srlv_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrlvw
CPUID Flags: AVX512VL + AVX512BW
Description
Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[i+15:i])
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:128] := 0
vpsrlvw
__m128i _mm_maskz_srlv_epi16 (__mmask8 k, __m128i a, __m128i count)
Synopsis
__m128i _mm_maskz_srlv_epi16 (__mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrlvw
CPUID Flags: AVX512VL + AVX512BW
Description
Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[i+15:i])
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpsrlvw
__m128i _mm_srlv_epi16 (__m128i a, __m128i count)
Synopsis
__m128i _mm_srlv_epi16 (__m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrlvw
CPUID Flags: AVX512VL + AVX512BW
Description
Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*16
dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[i+15:i])
ENDFOR
dst[MAX:128] := 0
vpsrlvw
__m256i _mm256_mask_srlv_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i count)
Synopsis
__m256i _mm256_mask_srlv_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsrlvw
CPUID Flags: AVX512VL + AVX512BW
Description
Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[i+15:i])
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
vpsrlvw
__m256i _mm256_maskz_srlv_epi16 (__mmask16 k, __m256i a, __m256i count)
Synopsis
__m256i _mm256_maskz_srlv_epi16 (__mmask16 k, __m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsrlvw
CPUID Flags: AVX512VL + AVX512BW
Description
Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[i+15:i])
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpsrlvw
__m256i _mm256_srlv_epi16 (__m256i a, __m256i count)
Synopsis
__m256i _mm256_srlv_epi16 (__m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsrlvw
CPUID Flags: AVX512VL + AVX512BW
Description
Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*16
dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[i+15:i])
ENDFOR
dst[MAX:256] := 0
vpsrlvw
__m512i _mm512_mask_srlv_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i count)
Synopsis
__m512i _mm512_mask_srlv_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsrlvw
CPUID Flags: AVX512BW
Description
Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[i+15:i])
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:512] := 0
vpsrlvw
__m512i _mm512_maskz_srlv_epi16 (__mmask32 k, __m512i a, __m512i count)
Synopsis
__m512i _mm512_maskz_srlv_epi16 (__mmask32 k, __m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsrlvw
CPUID Flags: AVX512BW
Description
Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[i+15:i])
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpsrlvw
__m512i _mm512_srlv_epi16 (__m512i a, __m512i count)
Synopsis
__m512i _mm512_srlv_epi16 (__m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsrlvw
CPUID Flags: AVX512BW
Description
Shift packed 16-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 31
i := j*16
dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[i+15:i])
ENDFOR
dst[MAX:512] := 0
vpsrlvd
__m128i _mm_mask_srlv_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i count)
Synopsis
__m128i _mm_mask_srlv_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrlvd
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vpsrlvd
__m128i _mm_maskz_srlv_epi32 (__mmask8 k, __m128i a, __m128i count)
Synopsis
__m128i _mm_maskz_srlv_epi32 (__mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrlvd
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpsrlvd
__m128i _mm_srlv_epi32 (__m128i a, __m128i count)
Synopsis
__m128i _mm_srlv_epi32 (__m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrlvd xmm, xmm, xmm
CPUID Flags: AVX2
Description
Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[i+31:i])
ENDFOR
dst[MAX:128] := 0
Performance
vpsrlvd
__m256i _mm256_mask_srlv_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i count)
Synopsis
__m256i _mm256_mask_srlv_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsrlvd
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vpsrlvd
__m256i _mm256_maskz_srlv_epi32 (__mmask8 k, __m256i a, __m256i count)
Synopsis
__m256i _mm256_maskz_srlv_epi32 (__mmask8 k, __m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsrlvd
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpsrlvd
__m256i _mm256_srlv_epi32 (__m256i a, __m256i count)
Synopsis
__m256i _mm256_srlv_epi32 (__m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsrlvd ymm, ymm, ymm
CPUID Flags: AVX2
Description
Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[i+31:i])
ENDFOR
dst[MAX:256] := 0
Performance
vpsrlvd
__m512i _mm512_mask_srlv_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i count)
Synopsis
__m512i _mm512_mask_srlv_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsrlvd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpsrlvd
__m512i _mm512_maskz_srlv_epi32 (__mmask16 k, __m512i a, __m512i count)
Synopsis
__m512i _mm512_maskz_srlv_epi32 (__mmask16 k, __m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsrlvd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[i+31:i])
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpsrlvd
__m512i _mm512_srlv_epi32 (__m512i a, __m512i count)
Synopsis
__m512i _mm512_srlv_epi32 (__m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsrlvd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[i+31:i])
ENDFOR
dst[MAX:512] := 0
vpsrlvq
__m128i _mm_mask_srlv_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i count)
Synopsis
__m128i _mm_mask_srlv_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrlvq
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpsrlvq
__m128i _mm_maskz_srlv_epi64 (__mmask8 k, __m128i a, __m128i count)
Synopsis
__m128i _mm_maskz_srlv_epi64 (__mmask8 k, __m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrlvq
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpsrlvq
__m128i _mm_srlv_epi64 (__m128i a, __m128i count)
Synopsis
__m128i _mm_srlv_epi64 (__m128i a, __m128i count)
#include "immintrin.h"
Instruction: vpsrlvq xmm, xmm, xmm
CPUID Flags: AVX2
Description
Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[i+63:i])
ENDFOR
dst[MAX:128] := 0
Performance
vpsrlvq
__m256i _mm256_mask_srlv_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i count)
Synopsis
__m256i _mm256_mask_srlv_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsrlvq
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpsrlvq
__m256i _mm256_maskz_srlv_epi64 (__mmask8 k, __m256i a, __m256i count)
Synopsis
__m256i _mm256_maskz_srlv_epi64 (__mmask8 k, __m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsrlvq
CPUID Flags: AVX512VL + AVX512F
Description
Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpsrlvq
__m256i _mm256_srlv_epi64 (__m256i a, __m256i count)
Synopsis
__m256i _mm256_srlv_epi64 (__m256i a, __m256i count)
#include "immintrin.h"
Instruction: vpsrlvq ymm, ymm, ymm
CPUID Flags: AVX2
Description
Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[i+63:i])
ENDFOR
dst[MAX:256] := 0
Performance
vpsrlvq
__m512i _mm512_mask_srlv_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i count)
Synopsis
__m512i _mm512_mask_srlv_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsrlvq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpsrlvq
__m512i _mm512_maskz_srlv_epi64 (__mmask8 k, __m512i a, __m512i count)
Synopsis
__m512i _mm512_maskz_srlv_epi64 (__mmask8 k, __m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsrlvq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[i+63:i])
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpsrlvq
__m512i _mm512_srlv_epi64 (__m512i a, __m512i count)
Synopsis
__m512i _mm512_srlv_epi64 (__m512i a, __m512i count)
#include "immintrin.h"
Instruction: vpsrlvq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in zeros, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[i+63:i])
ENDFOR
dst[MAX:512] := 0
vmovdqa32
void _mm_mask_store_epi32 (void* mem_addr, __mmask8 k, __m128i a)
Synopsis
void _mm_mask_store_epi32 (void* mem_addr, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vmovdqa32
CPUID Flags: AVX512VL + AVX512F
Description
Store packed 32-bit integers from a into memory using writemask k.
mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
FI
ENDFOR
vmovdqa32
void _mm256_mask_store_epi32 (void* mem_addr, __mmask8 k, __m256i a)
Synopsis
void _mm256_mask_store_epi32 (void* mem_addr, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vmovdqa32
CPUID Flags: AVX512VL + AVX512F
Description
Store packed 32-bit integers from a into memory using writemask k.
mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
FI
ENDFOR
vmovdqa32
void _mm512_mask_store_epi32 (void* mem_addr, __mmask16 k, __m512i a)
Synopsis
void _mm512_mask_store_epi32 (void* mem_addr, __mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vmovdqa32 m512 {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Store packed 32-bit integers from a into memory using writemask k.
mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
FI
ENDFOR
vmovdqa32
void _mm512_store_epi32 (void* mem_addr, __m512i a)
Synopsis
void _mm512_store_epi32 (void* mem_addr, __m512i a)
#include "immintrin.h"
Instruction: vmovdqa32 m512 {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Store 512-bits (composed of 16 packed 32-bit integers) from a into memory.
mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
Operation
MEM[mem_addr+511:mem_addr] := a[511:0]
vmovdqa64
void _mm_mask_store_epi64 (void* mem_addr, __mmask8 k, __m128i a)
Synopsis
void _mm_mask_store_epi64 (void* mem_addr, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vmovdqa64
CPUID Flags: AVX512VL + AVX512F
Description
Store packed 64-bit integers from a into memory using writemask k.
mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
FI
ENDFOR
vmovdqa64
void _mm256_mask_store_epi64 (void* mem_addr, __mmask8 k, __m256i a)
Synopsis
void _mm256_mask_store_epi64 (void* mem_addr, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vmovdqa64
CPUID Flags: AVX512VL + AVX512F
Description
Store packed 64-bit integers from a into memory using writemask k.
mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
FI
ENDFOR
vmovdqa64
void _mm512_mask_store_epi64 (void* mem_addr, __mmask8 k, __m512i a)
Synopsis
void _mm512_mask_store_epi64 (void* mem_addr, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vmovdqa64 m512 {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Store packed 64-bit integers from a into memory using writemask k.
mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
FI
ENDFOR
vmovdqa64
void _mm512_store_epi64 (void* mem_addr, __m512i a)
Synopsis
void _mm512_store_epi64 (void* mem_addr, __m512i a)
#include "immintrin.h"
Instruction: vmovdqa64 m512 {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Store 512-bits (composed of 8 packed 64-bit integers) from a into memory.
mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
Operation
MEM[mem_addr+511:mem_addr] := a[511:0]
vmovapd
void _mm_mask_store_pd (void* mem_addr, __mmask8 k, __m128d a)
Synopsis
void _mm_mask_store_pd (void* mem_addr, __mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vmovapd
CPUID Flags: AVX512VL + AVX512F
Description
Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k.
mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
FI
ENDFOR
movapd
void _mm_store_pd (double* mem_addr, __m128d a)
Synopsis
void _mm_store_pd (double* mem_addr, __m128d a)
#include "emmintrin.h"
Instruction: movapd m128, xmm
CPUID Flags: SSE2
Description
Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a into memory.
mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
Operation
MEM[mem_addr+127:mem_addr] := a[127:0]
vmovapd
void _mm256_mask_store_pd (void* mem_addr, __mmask8 k, __m256d a)
Synopsis
void _mm256_mask_store_pd (void* mem_addr, __mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vmovapd
CPUID Flags: AVX512VL + AVX512F
Description
Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k.
mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
FI
ENDFOR
vmovapd
void _mm256_store_pd (double * mem_addr, __m256d a)
Synopsis
void _mm256_store_pd (double * mem_addr, __m256d a)
#include "immintrin.h"
Instruction: vmovapd m256, ymm
CPUID Flags: AVX
Description
Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from a into memory.
mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.
Operation
MEM[mem_addr+255:mem_addr] := a[255:0]
vmovapd
void _mm512_mask_store_pd (void* mem_addr, __mmask8 k, __m512d a)
Synopsis
void _mm512_mask_store_pd (void* mem_addr, __mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vmovapd m512 {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k.
mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
FI
ENDFOR
vmovapd
void _mm512_store_pd (void* mem_addr, __m512d a)
Synopsis
void _mm512_store_pd (void* mem_addr, __m512d a)
#include "immintrin.h"
Instruction: vmovapd m512 {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Store 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from a into memory.
mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
Operation
MEM[mem_addr+511:mem_addr] := a[511:0]
...
void _mm_store_pd1 (double* mem_addr, __m128d a)
Synopsis
void _mm_store_pd1 (double* mem_addr, __m128d a)
#include "emmintrin.h"
CPUID Flags: SSE2
Description
Store the lower double-precision (64-bit) floating-point element from a into 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
Operation
MEM[mem_addr+63:mem_addr] := a[63:0]
MEM[mem_addr+127:mem_addr+64] := a[63:0]
vmovaps
void _mm_mask_store_ps (void* mem_addr, __mmask8 k, __m128 a)
Synopsis
void _mm_mask_store_ps (void* mem_addr, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vmovaps
CPUID Flags: AVX512VL + AVX512F
Description
Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k.
mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
FI
ENDFOR
movaps
void _mm_store_ps (float* mem_addr, __m128 a)
Synopsis
void _mm_store_ps (float* mem_addr, __m128 a)
#include "xmmintrin.h"
Instruction: movaps m128, xmm
CPUID Flags: SSE
Description
Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a into memory.
mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
Operation
MEM[mem_addr+127:mem_addr] := a[127:0]
vmovaps
void _mm256_mask_store_ps (void* mem_addr, __mmask8 k, __m256 a)
Synopsis
void _mm256_mask_store_ps (void* mem_addr, __mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vmovaps
CPUID Flags: AVX512VL + AVX512F
Description
Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k.
mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
FI
ENDFOR
vmovaps
void _mm256_store_ps (float * mem_addr, __m256 a)
Synopsis
void _mm256_store_ps (float * mem_addr, __m256 a)
#include "immintrin.h"
Instruction: vmovaps m256, ymm
CPUID Flags: AVX
Description
Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from a into memory.
mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.
Operation
MEM[mem_addr+255:mem_addr] := a[255:0]
vmovaps
void _mm512_mask_store_ps (void* mem_addr, __mmask16 k, __m512 a)
Synopsis
void _mm512_mask_store_ps (void* mem_addr, __mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vmovaps m512 {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k.
mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
FI
ENDFOR
vmovaps
void _mm512_store_ps (void* mem_addr, __m512 a)
Synopsis
void _mm512_store_ps (void* mem_addr, __m512 a)
#include "immintrin.h"
Instruction: vmovaps m512 {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Store 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from a into memory.
mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
Operation
MEM[mem_addr+511:mem_addr] := a[511:0]
...
void _mm_store_ps1 (float* mem_addr, __m128 a)
Synopsis
void _mm_store_ps1 (float* mem_addr, __m128 a)
#include "xmmintrin.h"
CPUID Flags: SSE
Description
Store the lower single-precision (32-bit) floating-point element from a into 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
Operation
MEM[mem_addr+31:mem_addr] := a[31:0]
MEM[mem_addr+63:mem_addr+32] := a[31:0]
MEM[mem_addr+95:mem_addr+64] := a[31:0]
MEM[mem_addr+127:mem_addr+96] := a[31:0]
vmovsd
void _mm_mask_store_sd (double* mem_addr, __mmask8 k, __m128d a)
Synopsis
void _mm_mask_store_sd (double* mem_addr, __mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vmovsd m64 {k}, xmm
CPUID Flags: AVX512F
Description
Store the lower double-precision (64-bit) floating-point element from a into memory using writemask k.
mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
Operation
IF k[0]
MEM[mem_addr+63:mem_addr] := a[63:0]
FI
movsd
void _mm_store_sd (double* mem_addr, __m128d a)
Synopsis
void _mm_store_sd (double* mem_addr, __m128d a)
#include "emmintrin.h"
Instruction: movsd m64, xmm
CPUID Flags: SSE2
Description
Store the lower double-precision (64-bit) floating-point element from a into memory. mem_addr does not need to be aligned on any particular boundary.
Operation
MEM[mem_addr+63:mem_addr] := a[63:0]
movdqa
void _mm_store_si128 (__m128i* mem_addr, __m128i a)
Synopsis
void _mm_store_si128 (__m128i* mem_addr, __m128i a)
#include "emmintrin.h"
Instruction: movdqa m128, xmm
CPUID Flags: SSE2
Description
Store 128-bits of integer data from a into memory.
mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
Operation
MEM[mem_addr+127:mem_addr] := a[127:0]
vmovdqa
void _mm256_store_si256 (__m256i * mem_addr, __m256i a)
Synopsis
void _mm256_store_si256 (__m256i * mem_addr, __m256i a)
#include "immintrin.h"
Instruction: vmovdqa m256, ymm
CPUID Flags: AVX
Description
Store 256-bits of integer data from a into memory.
mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.
Operation
MEM[mem_addr+255:mem_addr] := a[255:0]
vmovdqa32
void _mm512_store_si512 (void* mem_addr, __m512i a)
Synopsis
void _mm512_store_si512 (void* mem_addr, __m512i a)
#include "immintrin.h"
Instruction: vmovdqa32 m512 {k}, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Store 512-bits of integer data from a into memory.
mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
Operation
MEM[mem_addr+511:mem_addr] := a[511:0]
vmovss
void _mm_mask_store_ss (float* mem_addr, __mmask8 k, __m128 a)
Synopsis
void _mm_mask_store_ss (float* mem_addr, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vmovss m32 {k}, xmm
CPUID Flags: AVX512F
Description
Store the lower single-precision (32-bit) floating-point element from a into memory using writemask k.
mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
Operation
IF k[0]
MEM[mem_addr+31:mem_addr] := a[31:0]
FI
movss
void _mm_store_ss (float* mem_addr, __m128 a)
Synopsis
void _mm_store_ss (float* mem_addr, __m128 a)
#include "xmmintrin.h"
Instruction: movss m32, xmm
CPUID Flags: SSE
Description
Store the lower single-precision (32-bit) floating-point element from a into memory. mem_addr does not need to be aligned on any particular boundary.
Operation
MEM[mem_addr+31:mem_addr] := a[31:0]
...
void _mm_store1_pd (double* mem_addr, __m128d a)
Synopsis
void _mm_store1_pd (double* mem_addr, __m128d a)
#include "emmintrin.h"
CPUID Flags: SSE2
Description
Store the lower double-precision (64-bit) floating-point element from a into 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
Operation
MEM[mem_addr+63:mem_addr] := a[63:0]
MEM[mem_addr+127:mem_addr+64] := a[63:0]
...
void _mm_store1_ps (float* mem_addr, __m128 a)
Synopsis
void _mm_store1_ps (float* mem_addr, __m128 a)
#include "xmmintrin.h"
CPUID Flags: SSE
Description
Store the lower single-precision (32-bit) floating-point element from a into 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
Operation
MEM[mem_addr+31:mem_addr] := a[31:0]
MEM[mem_addr+63:mem_addr+32] := a[31:0]
MEM[mem_addr+95:mem_addr+64] := a[31:0]
MEM[mem_addr+127:mem_addr+96] := a[31:0]
...
void _storebe_i16 (void * ptr, short data)
Synopsis
void _storebe_i16 (void * ptr, short data)
#include "immintrin.h"
Description
Stores word-sized (16-bit) data to address ptr in big-endian format.
Operation
addr := MEM[ptr]
FOR j := 0 to 1
i := j*8
addr[i+7:i] := data[15-i:15-i-7]
ENDFOR
...
void _storebe_i32 (void * ptr, int data)
Synopsis
void _storebe_i32 (void * ptr, int data)
#include "immintrin.h"
Description
Stores double word-sized (32-bit) data to address ptr in big-endian format.
Operation
addr := MEM[ptr]
FOR j := 0 to 3
i := j*8
addr[i+7:i] := data[31-i:31-i-7]
ENDFOR
...
void _storebe_i64 (void * ptr, __int64 data)
Synopsis
void _storebe_i64 (void * ptr, __int64 data)
#include "immintrin.h"
Description
Stores quad word-sized (64-bit) data to address ptr in big-endian format.
Operation
addr := MEM[ptr]
FOR j := 0 to 7
i := j*8
addr[i+7:i] := data[63-i:63-i-7]
ENDFOR
movhpd
void _mm_storeh_pd (double* mem_addr, __m128d a)
Synopsis
void _mm_storeh_pd (double* mem_addr, __m128d a)
#include "emmintrin.h"
Instruction: movhpd m64, xmm
CPUID Flags: SSE2
Description
Store the upper double-precision (64-bit) floating-point element from a into memory.
Operation
MEM[mem_addr+63:mem_addr] := a[127:64]
movhps
void _mm_storeh_pi (__m64* mem_addr, __m128 a)
Synopsis
void _mm_storeh_pi (__m64* mem_addr, __m128 a)
#include "xmmintrin.h"
Instruction: movhps m64, xmm
CPUID Flags: SSE
Description
Store the upper 2 single-precision (32-bit) floating-point elements from a into memory.
Operation
MEM[mem_addr+31:mem_addr] := a[95:64]
MEM[mem_addr+63:mem_addr+32] := a[127:96]
movq
void _mm_storel_epi64 (__m128i* mem_addr, __m128i a)
Synopsis
void _mm_storel_epi64 (__m128i* mem_addr, __m128i a)
#include "emmintrin.h"
Instruction: movq m64, xmm
CPUID Flags: SSE2
Description
Store 64-bit integer from the first element of a into memory.
Operation
MEM[mem_addr+63:mem_addr] := a[63:0]
movlpd
void _mm_storel_pd (double* mem_addr, __m128d a)
Synopsis
void _mm_storel_pd (double* mem_addr, __m128d a)
#include "emmintrin.h"
Instruction: movlpd m64, xmm
CPUID Flags: SSE2
Description
Store the lower double-precision (64-bit) floating-point element from a into memory.
Operation
MEM[mem_addr+63:mem_addr] := a[63:0]
movlps
void _mm_storel_pi (__m64* mem_addr, __m128 a)
Synopsis
void _mm_storel_pi (__m64* mem_addr, __m128 a)
#include "xmmintrin.h"
Instruction: movlps m64, xmm
CPUID Flags: SSE
Description
Store the lower 2 single-precision (32-bit) floating-point elements from a into memory.
Operation
MEM[mem_addr+31:mem_addr] := a[31:0]
MEM[mem_addr+63:mem_addr+32] := a[63:32]
vmovnrapd
void _mm512_storenr_pd (void * mt, __m512d v)
Synopsis
void _mm512_storenr_pd (void * mt, __m512d v)
#include "immintrin.h"
Instruction: vmovnrapd m512 {k}, zmm
CPUID Flags: KNCNI
Description
Stores packed double-precision (64-bit) floating-point elements from v to memory address mt with a no-read hint to the processor.
Operation
addr := MEM[mt]
FOR j := 0 to 7
i := j*64
addr[i+63:i] := v[i+63:i]
ENDFOR
vmovnraps
void _mm512_storenr_ps (void * mt, __m512 v)
Synopsis
void _mm512_storenr_ps (void * mt, __m512 v)
#include "immintrin.h"
Instruction: vmovnraps m512 {k}, zmm
CPUID Flags: KNCNI
Description
Stores packed single-precision (32-bit) floating-point elements from v to memory address mt with a no-read hint to the processor.
Operation
addr := MEM[mt]
FOR j := 0 to 15
i := j*32
addr[i+31:i] := v[i+31:i]
ENDFOR
vmovnrngoapd
void _mm512_storenrngo_pd (void * mt, __m512d v)
Synopsis
void _mm512_storenrngo_pd (void * mt, __m512d v)
#include "immintrin.h"
Instruction: vmovnrngoapd m512 {k}, zmm
CPUID Flags: KNCNI
Description
Stores packed double-precision (64-bit) floating-point elements from v to memory address mt with a no-read hint and using a weakly-ordered memory consistency model (stores performed with this function are not globally ordered, and subsequent stores from the same thread can be observed before them).
Operation
addr := MEM[mt]
FOR j := 0 to 7
i := j*64
addr[i+63:i] := v[i+63:i]
ENDFOR
vmovnrngoaps
void _mm512_storenrngo_ps (void * mt, __m512 v)
Synopsis
void _mm512_storenrngo_ps (void * mt, __m512 v)
#include "immintrin.h"
Instruction: vmovnrngoaps m512 {k}, zmm
CPUID Flags: KNCNI
Description
Stores packed single-precision (32-bit) floating-point elements from v to memory address mt with a no-read hint and using a weakly-ordered memory consistency model (stores performed with this function are not globally ordered, and subsequent stores from the same thread can be observed before them).
Operation
addr := MEM[mt]
FOR j := 0 to 15
i := j*32
addr[i+31:i] := v[i+31:i]
ENDFOR
...
void _mm_storer_pd (double* mem_addr, __m128d a)
Synopsis
void _mm_storer_pd (double* mem_addr, __m128d a)
#include "emmintrin.h"
CPUID Flags: SSE2
Description
Store 2 double-precision (64-bit) floating-point elements from a into memory in reverse order.
mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
Operation
MEM[mem_addr+63:mem_addr] := a[127:64]
MEM[mem_addr+127:mem_addr+64] := a[63:0]
...
void _mm_storer_ps (float* mem_addr, __m128 a)
Synopsis
void _mm_storer_ps (float* mem_addr, __m128 a)
#include "xmmintrin.h"
Instruction: movups m128, xmm
CPUID Flags: SSE
Description
Store 4 single-precision (32-bit) floating-point elements from a into memory in reverse order.
mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
Operation
MEM[mem_addr+31:mem_addr] := a[127:96]
MEM[mem_addr+63:mem_addr+32] := a[95:64]
MEM[mem_addr+95:mem_addr+64] := a[63:32]
MEM[mem_addr+127:mem_addr+96] := a[31:0]
vmovdqu16
void _mm_mask_storeu_epi16 (void* mem_addr, __mmask8 k, __m128i a)
Synopsis
void _mm_mask_storeu_epi16 (void* mem_addr, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vmovdqu16
CPUID Flags: AVX512VL + AVX512BW
Description
Store packed 16-bit integers from a into memory using writemask k.
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
MEM[mem_addr+i+15:mem_addr+i] := a[i+15:i]
FI
ENDFOR
vmovdqu16
void _mm256_mask_storeu_epi16 (void* mem_addr, __mmask16 k, __m256i a)
Synopsis
void _mm256_mask_storeu_epi16 (void* mem_addr, __mmask16 k, __m256i a)
#include "immintrin.h"
Instruction: vmovdqu16
CPUID Flags: AVX512VL + AVX512BW
Description
Store packed 16-bit integers from a into memory using writemask k.
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
MEM[mem_addr+i+15:mem_addr+i] := a[i+15:i]
FI
ENDFOR
vmovdqu16
void _mm512_mask_storeu_epi16 (void* mem_addr, __mmask32 k, __m512i a)
Synopsis
void _mm512_mask_storeu_epi16 (void* mem_addr, __mmask32 k, __m512i a)
#include "immintrin.h"
Instruction: vmovdqu16
CPUID Flags: AVX512BW
Description
Store packed 16-bit integers from a into memory using writemask k.
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
MEM[mem_addr+i+15:mem_addr+i] := a[i+15:i]
FI
ENDFOR
vmovdqu32
void _mm_mask_storeu_epi32 (void* mem_addr, __mmask8 k, __m128i a)
Synopsis
void _mm_mask_storeu_epi32 (void* mem_addr, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vmovdqu32
CPUID Flags: AVX512VL + AVX512F
Description
Store packed 32-bit integers from a into memory using writemask k.
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
FI
ENDFOR
vmovdqu32
void _mm256_mask_storeu_epi32 (void* mem_addr, __mmask8 k, __m256i a)
Synopsis
void _mm256_mask_storeu_epi32 (void* mem_addr, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vmovdqu32
CPUID Flags: AVX512VL + AVX512F
Description
Store packed 32-bit integers from a into memory using writemask k.
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
FI
ENDFOR
vmovdqu32
void _mm512_mask_storeu_epi32 (void* mem_addr, __mmask16 k, __m512i a)
Synopsis
void _mm512_mask_storeu_epi32 (void* mem_addr, __mmask16 k, __m512i a)
#include "immintrin.h"
Instruction: vmovdqu32 m512 {k}, zmm
CPUID Flags: AVX512F
Description
Store packed 32-bit integers from a into memory using writemask k.
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
FI
ENDFOR
vmovdqu64
void _mm_mask_storeu_epi64 (void* mem_addr, __mmask8 k, __m128i a)
Synopsis
void _mm_mask_storeu_epi64 (void* mem_addr, __mmask8 k, __m128i a)
#include "immintrin.h"
Instruction: vmovdqu64
CPUID Flags: AVX512VL + AVX512F
Description
Store packed 64-bit integers from a into memory using writemask k.
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
FI
ENDFOR
vmovdqu64
void _mm256_mask_storeu_epi64 (void* mem_addr, __mmask8 k, __m256i a)
Synopsis
void _mm256_mask_storeu_epi64 (void* mem_addr, __mmask8 k, __m256i a)
#include "immintrin.h"
Instruction: vmovdqu64
CPUID Flags: AVX512VL + AVX512F
Description
Store packed 64-bit integers from a into memory using writemask k.
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
FI
ENDFOR
vmovdqu64
void _mm512_mask_storeu_epi64 (void* mem_addr, __mmask8 k, __m512i a)
Synopsis
void _mm512_mask_storeu_epi64 (void* mem_addr, __mmask8 k, __m512i a)
#include "immintrin.h"
Instruction: vmovdqu64 m512 {k}, zmm
CPUID Flags: AVX512F
Description
Store packed 64-bit integers from a into memory using writemask k.
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
FI
ENDFOR
vmovdqu8
void _mm_mask_storeu_epi8 (void* mem_addr, __mmask16 k, __m128i a)
Synopsis
void _mm_mask_storeu_epi8 (void* mem_addr, __mmask16 k, __m128i a)
#include "immintrin.h"
Instruction: vmovdqu8
CPUID Flags: AVX512VL + AVX512BW
Description
Store packed 8-bit integers from a into memory using writemask k.
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 15
i := j*8
IF k[j]
MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i]
FI
ENDFOR
vmovdqu8
void _mm256_mask_storeu_epi8 (void* mem_addr, __mmask32 k, __m256i a)
Synopsis
void _mm256_mask_storeu_epi8 (void* mem_addr, __mmask32 k, __m256i a)
#include "immintrin.h"
Instruction: vmovdqu8
CPUID Flags: AVX512VL + AVX512BW
Description
Store packed 8-bit integers from a into memory using writemask k.
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 31
i := j*8
IF k[j]
MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i]
FI
ENDFOR
vmovdqu8
void _mm512_mask_storeu_epi8 (void* mem_addr, __mmask64 k, __m512i a)
Synopsis
void _mm512_mask_storeu_epi8 (void* mem_addr, __mmask64 k, __m512i a)
#include "immintrin.h"
Instruction: vmovdqu8
CPUID Flags: AVX512BW
Description
Store packed 8-bit integers from a into memory using writemask k.
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 63
i := j*8
IF k[j]
MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i]
FI
ENDFOR
vmovupd
void _mm_mask_storeu_pd (void* mem_addr, __mmask8 k, __m128d a)
Synopsis
void _mm_mask_storeu_pd (void* mem_addr, __mmask8 k, __m128d a)
#include "immintrin.h"
Instruction: vmovupd
CPUID Flags: AVX512VL + AVX512F
Description
Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k.
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
FI
ENDFOR
movupd
void _mm_storeu_pd (double* mem_addr, __m128d a)
Synopsis
void _mm_storeu_pd (double* mem_addr, __m128d a)
#include "emmintrin.h"
Instruction: movupd m128, xmm
CPUID Flags: SSE2
Description
Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a into memory.
mem_addr does not need to be aligned on any particular boundary.
Operation
MEM[mem_addr+127:mem_addr] := a[127:0]
vmovupd
void _mm256_mask_storeu_pd (void* mem_addr, __mmask8 k, __m256d a)
Synopsis
void _mm256_mask_storeu_pd (void* mem_addr, __mmask8 k, __m256d a)
#include "immintrin.h"
Instruction: vmovupd
CPUID Flags: AVX512VL + AVX512F
Description
Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k.
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
FI
ENDFOR
vmovupd
void _mm256_storeu_pd (double * mem_addr, __m256d a)
Synopsis
void _mm256_storeu_pd (double * mem_addr, __m256d a)
#include "immintrin.h"
Instruction: vmovupd m256, ymm
CPUID Flags: AVX
Description
Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from a into memory.
mem_addr does not need to be aligned on any particular boundary.
Operation
MEM[mem_addr+255:mem_addr] := a[255:0]
vmovupd
void _mm512_mask_storeu_pd (void* mem_addr, __mmask8 k, __m512d a)
Synopsis
void _mm512_mask_storeu_pd (void* mem_addr, __mmask8 k, __m512d a)
#include "immintrin.h"
Instruction: vmovupd m512 {k}, zmm
CPUID Flags: AVX512F
Description
Store packed double-precision (64-bit) floating-point elements from a into memory using writemask k.
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
FI
ENDFOR
vmovupd
void _mm512_storeu_pd (void* mem_addr, __m512d a)
Synopsis
void _mm512_storeu_pd (void* mem_addr, __m512d a)
#include "immintrin.h"
Instruction: vmovupd m512 {k}, zmm
CPUID Flags: AVX512F
Description
Store 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from a into memory.
mem_addr does not need to be aligned on any particular boundary.
Operation
MEM[mem_addr+511:mem_addr] := a[511:0]
vmovups
void _mm_mask_storeu_ps (void* mem_addr, __mmask8 k, __m128 a)
Synopsis
void _mm_mask_storeu_ps (void* mem_addr, __mmask8 k, __m128 a)
#include "immintrin.h"
Instruction: vmovups
CPUID Flags: AVX512VL + AVX512F
Description
Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k.
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
FI
ENDFOR
movups
void _mm_storeu_ps (float* mem_addr, __m128 a)
Synopsis
void _mm_storeu_ps (float* mem_addr, __m128 a)
#include "xmmintrin.h"
Instruction: movups m128, xmm
CPUID Flags: SSE
Description
Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a into memory.
mem_addr does not need to be aligned on any particular boundary.
Operation
MEM[mem_addr+127:mem_addr] := a[127:0]
vmovups
void _mm256_mask_storeu_ps (void* mem_addr, __mmask8 k, __m256 a)
Synopsis
void _mm256_mask_storeu_ps (void* mem_addr, __mmask8 k, __m256 a)
#include "immintrin.h"
Instruction: vmovups
CPUID Flags: AVX512VL + AVX512F
Description
Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k.
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
FI
ENDFOR
vmovups
void _mm256_storeu_ps (float * mem_addr, __m256 a)
Synopsis
void _mm256_storeu_ps (float * mem_addr, __m256 a)
#include "immintrin.h"
Instruction: vmovups m256, ymm
CPUID Flags: AVX
Description
Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from a into memory.
mem_addr does not need to be aligned on any particular boundary.
Operation
MEM[mem_addr+255:mem_addr] := a[255:0]
vmovups
void _mm512_mask_storeu_ps (void* mem_addr, __mmask16 k, __m512 a)
Synopsis
void _mm512_mask_storeu_ps (void* mem_addr, __mmask16 k, __m512 a)
#include "immintrin.h"
Instruction: vmovups m512 {k}, zmm
CPUID Flags: AVX512F
Description
Store packed single-precision (32-bit) floating-point elements from a into memory using writemask k.
mem_addr does not need to be aligned on any particular boundary.
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
FI
ENDFOR
vmovups
void _mm512_storeu_ps (void* mem_addr, __m512 a)
Synopsis
void _mm512_storeu_ps (void* mem_addr, __m512 a)
#include "immintrin.h"
Instruction: vmovups m512 {k}, zmm
CPUID Flags: AVX512F
Description
Store 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from a into memory.
mem_addr does not need to be aligned on any particular boundary.
Operation
MEM[mem_addr+511:mem_addr] := a[511:0]
movdqu
void _mm_storeu_si128 (__m128i* mem_addr, __m128i a)
Synopsis
void _mm_storeu_si128 (__m128i* mem_addr, __m128i a)
#include "emmintrin.h"
Instruction: movdqu m128, xmm
CPUID Flags: SSE2
Description
Store 128-bits of integer data from a into memory.
mem_addr does not need to be aligned on any particular boundary.
Operation
MEM[mem_addr+127:mem_addr] := a[127:0]
...
void _mm_storeu_si16 (void* mem_addr, __m128i a)
Synopsis
void _mm_storeu_si16 (void* mem_addr, __m128i a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Store 16-bit integer from the first element of a into memory. mem_addr does not need to be aligned on any particular boundary.
Operation
MEM[mem_addr+15:mem_addr] := a[15:0]
movd+movw
void _mm_storeu_si16 (void* mem_addr, __m128i a)
Synopsis
void _mm_storeu_si16 (void* mem_addr, __m128i a)
#include "immintrin.h"
Instruction: movd+movw
Description
Store 16-bit integer from the first element of a into memory. mem_addr does not need to be aligned on any particular boundary.
Operation
MEM[mem_addr+15:mem_addr] := a[15:0]
vmovdqu
void _mm256_storeu_si256 (__m256i * mem_addr, __m256i a)
Synopsis
void _mm256_storeu_si256 (__m256i * mem_addr, __m256i a)
#include "immintrin.h"
Instruction: vmovdqu m256, ymm
CPUID Flags: AVX
Description
Store 256-bits of integer data from a into memory.
mem_addr does not need to be aligned on any particular boundary.
Operation
MEM[mem_addr+255:mem_addr] := a[255:0]
movd
void _mm_storeu_si32 (void* mem_addr, __m128i a)
Synopsis
void _mm_storeu_si32 (void* mem_addr, __m128i a)
#include "immintrin.h"
Instruction: movd
Description
Store 32-bit integer from the first element of a into memory. mem_addr does not need to be aligned on any particular boundary.
Operation
MEM[mem_addr+31:mem_addr] := a[31:0]
movd
void _mm_storeu_si32 (void* mem_addr, __m128i a)
Synopsis
void _mm_storeu_si32 (void* mem_addr, __m128i a)
#include "immintrin.h"
Instruction: movd m32, xmm
CPUID Flags: SSE
Description
Store 32-bit integer from the first element of a into memory. mem_addr does not need to be aligned on any particular boundary.
Operation
MEM[mem_addr+31:mem_addr] := a[31:0]
vmovdqu32
void _mm512_storeu_si512 (void* mem_addr, __m512i a)
Synopsis
void _mm512_storeu_si512 (void* mem_addr, __m512i a)
#include "immintrin.h"
Instruction: vmovdqu32 m512 {k}, zmm
CPUID Flags: AVX512F
Description
Store 512-bits of integer data from a into memory.
mem_addr does not need to be aligned on any particular boundary.
Operation
MEM[mem_addr+511:mem_addr] := a[511:0]
movq
void _mm_storeu_si64 (void* mem_addr, __m128i a)
Synopsis
void _mm_storeu_si64 (void* mem_addr, __m128i a)
#include "immintrin.h"
Instruction: movq m64, xmm
CPUID Flags: SSE
Description
Store 64-bit integer from the first element of a into memory. mem_addr does not need to be aligned on any particular boundary.
Operation
MEM[mem_addr+63:mem_addr] := a[63:0]
movq
void _mm_storeu_si64 (void* mem_addr, __m128i a)
Synopsis
void _mm_storeu_si64 (void* mem_addr, __m128i a)
#include "immintrin.h"
Instruction: movq
Description
Store 64-bit integer from the first element of a into memory. mem_addr does not need to be aligned on any particular boundary.
Operation
MEM[mem_addr+63:mem_addr] := a[63:0]
...
void _mm256_storeu2_m128 (float* hiaddr, float* loaddr, __m256 a)
Synopsis
void _mm256_storeu2_m128 (float* hiaddr, float* loaddr, __m256 a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Store the high and low 128-bit halves (each composed of 4 packed single-precision (32-bit) floating-point elements) from a into memory at two different 128-bit locations.
hiaddr and loaddr do not need to be aligned on any particular boundary.
Operation
MEM[loaddr+127:loaddr] := a[127:0]
MEM[hiaddr+127:hiaddr] := a[255:128]
...
void _mm256_storeu2_m128d (double* hiaddr, double* loaddr, __m256d a)
Synopsis
void _mm256_storeu2_m128d (double* hiaddr, double* loaddr, __m256d a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Store the high and low 128-bit halves (each composed of 2 packed double-precision (64-bit) floating-point elements) from a into memory at two different 128-bit locations.
hiaddr and loaddr do not need to be aligned on any particular boundary.
Operation
MEM[loaddr+127:loaddr] := a[127:0]
MEM[hiaddr+127:hiaddr] := a[255:128]
...
void _mm256_storeu2_m128i (__m128i* hiaddr, __m128i* loaddr, __m256i a)
Synopsis
void _mm256_storeu2_m128i (__m128i* hiaddr, __m128i* loaddr, __m256i a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Store the high and low 128-bit halves (each composed of integer data) from a into memory at two different 128-bit locations.
hiaddr and loaddr do not need to be aligned on any particular boundary.
Operation
MEM[loaddr+127:loaddr] := a[127:0]
MEM[hiaddr+127:hiaddr] := a[255:128]
movntdqa
__m128i _mm_stream_load_si128 (__m128i* mem_addr)
Synopsis
__m128i _mm_stream_load_si128 (__m128i* mem_addr)
#include "smmintrin.h"
Instruction: movntdqa xmm, m128
CPUID Flags: SSE4.1
Description
Load 128-bits of integer data from memory into dst using a non-temporal memory hint.
mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
Operation
dst[127:0] := MEM[mem_addr+127:mem_addr]
vmovntdqa
__m256i _mm256_stream_load_si256 (__m256i const* mem_addr)
Synopsis
__m256i _mm256_stream_load_si256 (__m256i const* mem_addr)
#include "immintrin.h"
Instruction: vmovntdqa ymm, m256
CPUID Flags: AVX2
Description
Load 256-bits of integer data from memory into dst using a non-temporal memory hint.
mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.
Operation
dst[255:0] := MEM[mem_addr+255:mem_addr]
dst[MAX:256] := 0
vmovntdqa
__m512i _mm512_stream_load_si512 (void const* mem_addr)
Synopsis
__m512i _mm512_stream_load_si512 (void const* mem_addr)
#include "immintrin.h"
Instruction: vmovntdqa zmm, m512
CPUID Flags: AVX512F
Description
Load 512-bits of integer data from memory into dst using a non-temporal memory hint.
mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
Operation
dst[511:0] := MEM[mem_addr+511:mem_addr]
dst[MAX:512] := 0
movntpd
void _mm_stream_pd (double* mem_addr, __m128d a)
Synopsis
void _mm_stream_pd (double* mem_addr, __m128d a)
#include "emmintrin.h"
Instruction: movntpd m128, xmm
CPUID Flags: SSE2
Description
Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from a into memory using a non-temporal memory hint.
mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
Operation
MEM[mem_addr+127:mem_addr] := a[127:0]
vmovntpd
void _mm256_stream_pd (double * mem_addr, __m256d a)
Synopsis
void _mm256_stream_pd (double * mem_addr, __m256d a)
#include "immintrin.h"
Instruction: vmovntpd m256, ymm
CPUID Flags: AVX
Description
Store 256-bits (composed of 4 packed double-precision (64-bit) floating-point elements) from a into memory using a non-temporal memory hint.
mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.
Operation
MEM[mem_addr+255:mem_addr] := a[255:0]
vmovntpd
void _mm512_stream_pd (void* mem_addr, __m512d a)
Synopsis
void _mm512_stream_pd (void* mem_addr, __m512d a)
#include "immintrin.h"
Instruction: vmovntpd m512, zmm
CPUID Flags: AVX512F
Description
Store 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from a into memory using a non-temporal memory hint.
mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
Operation
MEM[mem_addr+511:mem_addr] := a[511:0]
movntq
void _mm_stream_pi (__m64* mem_addr, __m64 a)
Synopsis
void _mm_stream_pi (__m64* mem_addr, __m64 a)
#include "xmmintrin.h"
Instruction: movntq m64, mm
CPUID Flags: SSE
Description
Store 64-bits of integer data from a into memory using a non-temporal memory hint.
Operation
MEM[mem_addr+63:mem_addr] := a[63:0]
movntps
void _mm_stream_ps (float* mem_addr, __m128 a)
Synopsis
void _mm_stream_ps (float* mem_addr, __m128 a)
#include "xmmintrin.h"
Instruction: movntps m128, xmm
CPUID Flags: SSE
Description
Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a into memory using a non-temporal memory hint.
mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
Operation
MEM[mem_addr+127:mem_addr] := a[127:0]
vmovntps
void _mm256_stream_ps (float * mem_addr, __m256 a)
Synopsis
void _mm256_stream_ps (float * mem_addr, __m256 a)
#include "immintrin.h"
Instruction: vmovntps m256, ymm
CPUID Flags: AVX
Description
Store 256-bits (composed of 8 packed single-precision (32-bit) floating-point elements) from a into memory using a non-temporal memory hint.
mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.
Operation
MEM[mem_addr+255:mem_addr] := a[255:0]
vmovntps
void _mm512_stream_ps (void* mem_addr, __m512 a)
Synopsis
void _mm512_stream_ps (void* mem_addr, __m512 a)
#include "immintrin.h"
Instruction: vmovntps m512, zmm
CPUID Flags: AVX512F
Description
Store 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from a into memory using a non-temporal memory hint.
mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
Operation
MEM[mem_addr+511:mem_addr] := a[511:0]
movntdq
void _mm_stream_si128 (__m128i* mem_addr, __m128i a)
Synopsis
void _mm_stream_si128 (__m128i* mem_addr, __m128i a)
#include "emmintrin.h"
Instruction: movntdq m128, xmm
CPUID Flags: SSE2
Description
Store 128-bits of integer data from a into memory using a non-temporal memory hint.
mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
Operation
MEM[mem_addr+127:mem_addr] := a[127:0]
vmovntdq
void _mm256_stream_si256 (__m256i * mem_addr, __m256i a)
Synopsis
void _mm256_stream_si256 (__m256i * mem_addr, __m256i a)
#include "immintrin.h"
Instruction: vmovntdq m256, ymm
CPUID Flags: AVX
Description
Store 256-bits of integer data from a into memory using a non-temporal memory hint.
mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.
Operation
MEM[mem_addr+255:mem_addr] := a[255:0]
movnti
void _mm_stream_si32 (int* mem_addr, int a)
Synopsis
void _mm_stream_si32 (int* mem_addr, int a)
#include "emmintrin.h"
Instruction: movnti m32, r32
CPUID Flags: SSE2
Description
Store 32-bit integer a into memory using a non-temporal hint to minimize cache pollution. If the cache line containing address mem_addr is already in the cache, the cache will be updated.
Operation
MEM[mem_addr+31:mem_addr] := a[31:0]
vmovntdq
void _mm512_stream_si512 (void* mem_addr, __m512i a)
Synopsis
void _mm512_stream_si512 (void* mem_addr, __m512i a)
#include "immintrin.h"
Instruction: vmovntdq m512, zmm
CPUID Flags: AVX512F
Description
Store 512-bits of integer data from a into memory using a non-temporal memory hint.
mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
Operation
MEM[mem_addr+511:mem_addr] := a[511:0]
movnti
void _mm_stream_si64 (__int64* mem_addr, __int64 a)
Synopsis
void _mm_stream_si64 (__int64* mem_addr, __int64 a)
#include "emmintrin.h"
Instruction: movnti m64, r64
CPUID Flags: SSE2
Description
Store 64-bit integer a into memory using a non-temporal hint to minimize cache pollution. If the cache line containing address mem_addr is already in the cache, the cache will be updated.
Operation
MEM[mem_addr+63:mem_addr] := a[63:0]
vpsubw
__m128i _mm_mask_sub_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_sub_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpsubw
CPUID Flags: AVX512VL + AVX512BW
Description
Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := a[i+15:i] - b[i+15:i]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:128] := 0
vpsubw
__m128i _mm_maskz_sub_epi16 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_sub_epi16 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpsubw
CPUID Flags: AVX512VL + AVX512BW
Description
Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := a[i+15:i] - b[i+15:i]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
psubw
__m128i _mm_sub_epi16 (__m128i a, __m128i b)
Synopsis
__m128i _mm_sub_epi16 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: psubw xmm, xmm
CPUID Flags: SSE2
Description
Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*16
dst[i+15:i] := a[i+15:i] - b[i+15:i]
ENDFOR
Performance
vpsubw
__m256i _mm256_mask_sub_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_sub_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsubw
CPUID Flags: AVX512VL + AVX512BW
Description
Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := a[i+15:i] - b[i+15:i]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
vpsubw
__m256i _mm256_maskz_sub_epi16 (__mmask16 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_sub_epi16 (__mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsubw
CPUID Flags: AVX512VL + AVX512BW
Description
Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := a[i+15:i] - b[i+15:i]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpsubw
__m256i _mm256_sub_epi16 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_sub_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsubw ymm, ymm, ymm
CPUID Flags: AVX2
Description
Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*16
dst[i+15:i] := a[i+15:i] - b[i+15:i]
ENDFOR
dst[MAX:256] := 0
Performance
vpsubw
__m512i _mm512_mask_sub_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_sub_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpsubw
CPUID Flags: AVX512BW
Description
Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := a[i+15:i] - b[i+15:i]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:512] := 0
vpsubw
__m512i _mm512_maskz_sub_epi16 (__mmask32 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_sub_epi16 (__mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpsubw
CPUID Flags: AVX512BW
Description
Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := a[i+15:i] - b[i+15:i]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpsubw
__m512i _mm512_sub_epi16 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_sub_epi16 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpsubw
CPUID Flags: AVX512BW
Description
Subtract packed 16-bit integers in b from packed 16-bit integers in a, and store the results in dst.
Operation
FOR j := 0 to 31
i := j*16
dst[i+15:i] := a[i+15:i] - b[i+15:i]
ENDFOR
dst[MAX:512] := 0
vpsubd
__m128i _mm_mask_sub_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_sub_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpsubd
CPUID Flags: AVX512VL + AVX512F
Description
Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] - b[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vpsubd
__m128i _mm_maskz_sub_epi32 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_sub_epi32 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpsubd
CPUID Flags: AVX512VL + AVX512F
Description
Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] - b[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
psubd
__m128i _mm_sub_epi32 (__m128i a, __m128i b)
Synopsis
__m128i _mm_sub_epi32 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: psubd xmm, xmm
CPUID Flags: SSE2
Description
Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := a[i+31:i] - b[i+31:i]
ENDFOR
Performance
vpsubd
__m256i _mm256_mask_sub_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_sub_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsubd
CPUID Flags: AVX512VL + AVX512F
Description
Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] - b[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vpsubd
__m256i _mm256_maskz_sub_epi32 (__mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_sub_epi32 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsubd
CPUID Flags: AVX512VL + AVX512F
Description
Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] - b[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpsubd
__m256i _mm256_sub_epi32 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_sub_epi32 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsubd ymm, ymm, ymm
CPUID Flags: AVX2
Description
Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := a[i+31:i] - b[i+31:i]
ENDFOR
dst[MAX:256] := 0
Performance
vpsubd
__m512i _mm512_mask_sub_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_sub_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpsubd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] - b[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpsubd
__m512i _mm512_maskz_sub_epi32 (__mmask16 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_sub_epi32 (__mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpsubd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] - b[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpsubd
__m512i _mm512_sub_epi32 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_sub_epi32 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpsubd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Subtract packed 32-bit integers in b from packed 32-bit integers in a, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := a[i+31:i] - b[i+31:i]
ENDFOR
dst[MAX:512] := 0
vpsubq
__m128i _mm_mask_sub_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_sub_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpsubq
CPUID Flags: AVX512VL + AVX512F
Description
Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] - b[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpsubq
__m128i _mm_maskz_sub_epi64 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_sub_epi64 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpsubq
CPUID Flags: AVX512VL + AVX512F
Description
Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] - b[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
psubq
__m128i _mm_sub_epi64 (__m128i a, __m128i b)
Synopsis
__m128i _mm_sub_epi64 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: psubq xmm, xmm
CPUID Flags: SSE2
Description
Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := a[i+63:i] - b[i+63:i]
ENDFOR
Performance
vpsubq
__m256i _mm256_mask_sub_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_sub_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsubq
CPUID Flags: AVX512VL + AVX512F
Description
Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] - b[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpsubq
__m256i _mm256_maskz_sub_epi64 (__mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_sub_epi64 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsubq
CPUID Flags: AVX512VL + AVX512F
Description
Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] - b[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpsubq
__m256i _mm256_sub_epi64 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_sub_epi64 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsubq ymm, ymm, ymm
CPUID Flags: AVX2
Description
Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := a[i+63:i] - b[i+63:i]
ENDFOR
dst[MAX:256] := 0
Performance
vpsubq
__m512i _mm512_mask_sub_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_sub_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpsubq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] - b[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpsubq
__m512i _mm512_maskz_sub_epi64 (__mmask8 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_sub_epi64 (__mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpsubq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] - b[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpsubq
__m512i _mm512_sub_epi64 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_sub_epi64 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpsubq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Subtract packed 64-bit integers in b from packed 64-bit integers in a, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := a[i+63:i] - b[i+63:i]
ENDFOR
dst[MAX:512] := 0
vpsubb
__m128i _mm_mask_sub_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_sub_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpsubb
CPUID Flags: AVX512VL + AVX512BW
Description
Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k[j]
dst[i+7:i] := a[i+7:i] - b[i+7:i]
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:128] := 0
vpsubb
__m128i _mm_maskz_sub_epi8 (__mmask16 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_sub_epi8 (__mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpsubb
CPUID Flags: AVX512VL + AVX512BW
Description
Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k[j]
dst[i+7:i] := a[i+7:i] - b[i+7:i]
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
psubb
__m128i _mm_sub_epi8 (__m128i a, __m128i b)
Synopsis
__m128i _mm_sub_epi8 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: psubb xmm, xmm
CPUID Flags: SSE2
Description
Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*8
dst[i+7:i] := a[i+7:i] - b[i+7:i]
ENDFOR
Performance
vpsubb
__m256i _mm256_mask_sub_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_sub_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsubb
CPUID Flags: AVX512VL + AVX512BW
Description
Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k[j]
dst[i+7:i] := a[i+7:i] - b[i+7:i]
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:256] := 0
vpsubb
__m256i _mm256_maskz_sub_epi8 (__mmask32 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_sub_epi8 (__mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsubb
CPUID Flags: AVX512VL + AVX512BW
Description
Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k[j]
dst[i+7:i] := a[i+7:i] - b[i+7:i]
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpsubb
__m256i _mm256_sub_epi8 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_sub_epi8 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsubb ymm, ymm, ymm
CPUID Flags: AVX2
Description
Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst.
Operation
FOR j := 0 to 31
i := j*8
dst[i+7:i] := a[i+7:i] - b[i+7:i]
ENDFOR
dst[MAX:256] := 0
Performance
vpsubb
__m512i _mm512_mask_sub_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_sub_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpsubb
CPUID Flags: AVX512BW
Description
Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k[j]
dst[i+7:i] := a[i+7:i] - b[i+7:i]
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:512] := 0
vpsubb
__m512i _mm512_maskz_sub_epi8 (__mmask64 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_sub_epi8 (__mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpsubb
CPUID Flags: AVX512BW
Description
Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k[j]
dst[i+7:i] := a[i+7:i] - b[i+7:i]
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpsubb
__m512i _mm512_sub_epi8 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_sub_epi8 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpsubb
CPUID Flags: AVX512BW
Description
Subtract packed 8-bit integers in b from packed 8-bit integers in a, and store the results in dst.
Operation
FOR j := 0 to 63
i := j*8
dst[i+7:i] := a[i+7:i] - b[i+7:i]
ENDFOR
dst[MAX:512] := 0
vsubpd
__m128d _mm_mask_sub_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_mask_sub_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vsubpd
CPUID Flags: AVX512VL + AVX512F
Description
Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] - b[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vsubpd
__m128d _mm_maskz_sub_pd (__mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_maskz_sub_pd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vsubpd
CPUID Flags: AVX512VL + AVX512F
Description
Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] - b[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
subpd
__m128d _mm_sub_pd (__m128d a, __m128d b)
Synopsis
__m128d _mm_sub_pd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: subpd xmm, xmm
CPUID Flags: SSE2
Description
Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := a[i+63:i] - b[i+63:i]
ENDFOR
Performance
vsubpd
__m256d _mm256_mask_sub_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)
Synopsis
__m256d _mm256_mask_sub_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vsubpd
CPUID Flags: AVX512VL + AVX512F
Description
Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] - b[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vsubpd
__m256d _mm256_maskz_sub_pd (__mmask8 k, __m256d a, __m256d b)
Synopsis
__m256d _mm256_maskz_sub_pd (__mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vsubpd
CPUID Flags: AVX512VL + AVX512F
Description
Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] - b[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vsubpd
__m256d _mm256_sub_pd (__m256d a, __m256d b)
Synopsis
__m256d _mm256_sub_pd (__m256d a, __m256d b)
#include "immintrin.h"
Instruction: vsubpd ymm, ymm, ymm
CPUID Flags: AVX
Description
Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := a[i+63:i] - b[i+63:i]
ENDFOR
dst[MAX:256] := 0
Performance
vsubpd
__m512d _mm512_mask_sub_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
Synopsis
__m512d _mm512_mask_sub_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vsubpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] - b[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vsubpd
__m512d _mm512_maskz_sub_pd (__mmask8 k, __m512d a, __m512d b)
Synopsis
__m512d _mm512_maskz_sub_pd (__mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vsubpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] - b[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vsubpd
__m512d _mm512_sub_pd (__m512d a, __m512d b)
Synopsis
__m512d _mm512_sub_pd (__m512d a, __m512d b)
#include "immintrin.h"
Instruction: vsubpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Subtract packed double-precision (64-bit) floating-point elements in b from packed double-precision (64-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := a[i+63:i] - b[i+63:i]
ENDFOR
dst[MAX:512] := 0
vsubps
__m128 _mm_mask_sub_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_mask_sub_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vsubps
CPUID Flags: AVX512VL + AVX512F
Description
Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] - b[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vsubps
__m128 _mm_maskz_sub_ps (__mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_maskz_sub_ps (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vsubps
CPUID Flags: AVX512VL + AVX512F
Description
Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] - b[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
subps
__m128 _mm_sub_ps (__m128 a, __m128 b)
Synopsis
__m128 _mm_sub_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: subps xmm, xmm
CPUID Flags: SSE
Description
Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := a[i+31:i] - b[i+31:i]
ENDFOR
Performance
vsubps
__m256 _mm256_mask_sub_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)
Synopsis
__m256 _mm256_mask_sub_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vsubps
CPUID Flags: AVX512VL + AVX512F
Description
Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] - b[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vsubps
__m256 _mm256_maskz_sub_ps (__mmask8 k, __m256 a, __m256 b)
Synopsis
__m256 _mm256_maskz_sub_ps (__mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vsubps
CPUID Flags: AVX512VL + AVX512F
Description
Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] - b[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vsubps
__m256 _mm256_sub_ps (__m256 a, __m256 b)
Synopsis
__m256 _mm256_sub_ps (__m256 a, __m256 b)
#include "immintrin.h"
Instruction: vsubps ymm, ymm, ymm
CPUID Flags: AVX
Description
Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := a[i+31:i] - b[i+31:i]
ENDFOR
dst[MAX:256] := 0
Performance
vsubps
__m512 _mm512_mask_sub_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
Synopsis
__m512 _mm512_mask_sub_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vsubps zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] - b[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vsubps
__m512 _mm512_maskz_sub_ps (__mmask16 k, __m512 a, __m512 b)
Synopsis
__m512 _mm512_maskz_sub_ps (__mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vsubps zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] - b[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vsubps
__m512 _mm512_sub_ps (__m512 a, __m512 b)
Synopsis
__m512 _mm512_sub_ps (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vsubps zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Subtract packed single-precision (32-bit) floating-point elements in b from packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := a[i+31:i] - b[i+31:i]
ENDFOR
dst[MAX:512] := 0
vsubpd
__m512d _mm512_mask_sub_round_pd (__m512d src, __mmask8 k, __m512d a, __m512d b, int rounding)
Synopsis
__m512d _mm512_mask_sub_round_pd (__m512d src, __mmask8 k, __m512d a, __m512d b, int rounding)
#include "immintrin.h"
Instruction: vsubpd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Subtract packed double-precision (64-bit) floating-point elements in
b from packed double-precision (64-bit) floating-point elements in
a, and store the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] - b[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vsubpd
__m512d _mm512_maskz_sub_round_pd (__mmask8 k, __m512d a, __m512d b, int rounding)
Synopsis
__m512d _mm512_maskz_sub_round_pd (__mmask8 k, __m512d a, __m512d b, int rounding)
#include "immintrin.h"
Instruction: vsubpd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F
Description
Subtract packed double-precision (64-bit) floating-point elements in
b from packed double-precision (64-bit) floating-point elements in
a, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] - b[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vsubpd
__m512d _mm512_sub_round_pd (__m512d a, __m512d b, int rounding)
Synopsis
__m512d _mm512_sub_round_pd (__m512d a, __m512d b, int rounding)
#include "immintrin.h"
Instruction: vsubpd zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Subtract packed double-precision (64-bit) floating-point elements in
b from packed double-precision (64-bit) floating-point elements in
a, and store the results in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := a[i+63:i] - b[i+63:i]
ENDFOR
dst[MAX:512] := 0
vsubps
__m512 _mm512_mask_sub_round_ps (__m512 src, __mmask16 k, __m512 a, __m512 b, int rounding)
Synopsis
__m512 _mm512_mask_sub_round_ps (__m512 src, __mmask16 k, __m512 a, __m512 b, int rounding)
#include "immintrin.h"
Instruction: vsubps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Subtract packed single-precision (32-bit) floating-point elements in
b from packed single-precision (32-bit) floating-point elements in
a, and store the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] - b[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vsubps
__m512 _mm512_maskz_sub_round_ps (__mmask16 k, __m512 a, __m512 b, int rounding)
Synopsis
__m512 _mm512_maskz_sub_round_ps (__mmask16 k, __m512 a, __m512 b, int rounding)
#include "immintrin.h"
Instruction: vsubps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F
Description
Subtract packed single-precision (32-bit) floating-point elements in
b from packed single-precision (32-bit) floating-point elements in
a, and store the results in
dst using zeromask
k (elements are zeroed out when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] - b[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vsubps
__m512 _mm512_sub_round_ps (__m512 a, __m512 b, int rounding)
Synopsis
__m512 _mm512_sub_round_ps (__m512 a, __m512 b, int rounding)
#include "immintrin.h"
Instruction: vsubps zmm {k}, zmm, zmm {er}
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Subtract packed single-precision (32-bit) floating-point elements in
b from packed single-precision (32-bit) floating-point elements in
a, and store the results in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := a[i+31:i] - b[i+31:i]
ENDFOR
dst[MAX:512] := 0
vsubsd
__m128d _mm_mask_sub_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int rounding)
Synopsis
__m128d _mm_mask_sub_round_sd (__m128d src, __mmask8 k, __m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vsubsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Subtract the lower double-precision (64-bit) floating-point element in
b from the lower double-precision (64-bit) floating-point element in
a, store the result in the lower element of
dst using writemask
k (the element is copied from
src when mask bit 0 is not set), and copy the upper element from
a to the upper element of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[63:0] := a[63:0] - b[63:0]
ELSE
dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vsubsd
__m128d _mm_maskz_sub_round_sd (__mmask8 k, __m128d a, __m128d b, int rounding)
Synopsis
__m128d _mm_maskz_sub_round_sd (__mmask8 k, __m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vsubsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Subtract the lower double-precision (64-bit) floating-point element in
b from the lower double-precision (64-bit) floating-point element in
a, store the result in the lower element of
dst using zeromask
k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from
a to the upper element of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[63:0] := a[63:0] - b[63:0]
ELSE
dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vsubsd
__m128d _mm_sub_round_sd (__m128d a, __m128d b, int rounding)
Synopsis
__m128d _mm_sub_round_sd (__m128d a, __m128d b, int rounding)
#include "immintrin.h"
Instruction: vsubsd xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Subtract the lower double-precision (64-bit) floating-point element in
b from the lower double-precision (64-bit) floating-point element in
a, store the result in the lower element of
dst, and copy the upper element from
a to the upper element of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[63:0] := a[63:0] - b[63:0]
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vsubss
__m128 _mm_mask_sub_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int rounding)
Synopsis
__m128 _mm_mask_sub_round_ss (__m128 src, __mmask8 k, __m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vsubss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Subtract the lower single-precision (32-bit) floating-point element in
b from the lower single-precision (32-bit) floating-point element in
a, store the result in the lower element of
dst using writemask
k (the element is copied from
src when mask bit 0 is not set), and copy the upper 3 packed elements from
a to the upper elements of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[31:0] := a[31:0] - b[31:0]
ELSE
dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vsubss
__m128 _mm_maskz_sub_round_ss (__mmask8 k, __m128 a, __m128 b, int rounding)
Synopsis
__m128 _mm_maskz_sub_round_ss (__mmask8 k, __m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vsubss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Subtract the lower single-precision (32-bit) floating-point element in
b from the lower single-precision (32-bit) floating-point element in
a, store the result in the lower element of
dst using zeromask
k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from
a to the upper elements of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
IF k[0]
dst[31:0] := a[31:0] - b[31:0]
ELSE
dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vsubss
__m128 _mm_sub_round_ss (__m128 a, __m128 b, int rounding)
Synopsis
__m128 _mm_sub_round_ss (__m128 a, __m128 b, int rounding)
#include "immintrin.h"
Instruction: vsubss xmm {k}, xmm, xmm {er}
CPUID Flags: AVX512F
Description
Subtract the lower single-precision (32-bit) floating-point element in
b from the lower single-precision (32-bit) floating-point element in
a, store the result in the lower element of
dst, and copy the upper 3 packed elements from
a to the upper elements of
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
dst[31:0] := a[31:0] - b[31:0]
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vsubsd
__m128d _mm_mask_sub_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_mask_sub_sd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vsubsd xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Operation
IF k[0]
dst[63:0] := a[63:0] - b[63:0]
ELSE
dst[63:0] := src[63:0]
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
vsubsd
__m128d _mm_maskz_sub_sd (__mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_maskz_sub_sd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vsubsd xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
Operation
IF k[0]
dst[63:0] := a[63:0] - b[63:0]
ELSE
dst[63:0] := 0
FI
dst[127:64] := a[127:64]
dst[MAX:128] := 0
subsd
__m128d _mm_sub_sd (__m128d a, __m128d b)
Synopsis
__m128d _mm_sub_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: subsd xmm, xmm
CPUID Flags: SSE2
Description
Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.
Operation
dst[63:0] := a[63:0] - b[63:0]
dst[127:64] := a[127:64]
Performance
psubq
__m64 _mm_sub_si64 (__m64 a, __m64 b)
Synopsis
__m64 _mm_sub_si64 (__m64 a, __m64 b)
#include "emmintrin.h"
Instruction: psubq mm, mm
CPUID Flags: SSE2
Description
Subtract 64-bit integer b from 64-bit integer a, and store the result in dst.
Operation
dst[63:0] := a[63:0] - b[63:0]
Performance
vsubss
__m128 _mm_mask_sub_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_mask_sub_ss (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vsubss xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
IF k[0]
dst[31:0] := a[31:0] - b[31:0]
ELSE
dst[31:0] := src[31:0]
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
vsubss
__m128 _mm_maskz_sub_ss (__mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_maskz_sub_ss (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vsubss xmm {k}, xmm, xmm
CPUID Flags: AVX512F
Description
Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
IF k[0]
dst[31:0] := a[31:0] - b[31:0]
ELSE
dst[31:0] := 0
FI
dst[127:32] := a[127:32]
dst[MAX:128] := 0
subss
__m128 _mm_sub_ss (__m128 a, __m128 b)
Synopsis
__m128 _mm_sub_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: subss xmm, xmm
CPUID Flags: SSE
Description
Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.
Operation
dst[31:0] := a[31:0] - b[31:0]
dst[127:32] := a[127:32]
Performance
sbb
unsigned char _subborrow_u32 (unsigned char b_in, unsigned int a, unsigned int b, unsigned int * out)
Synopsis
unsigned char _subborrow_u32 (unsigned char b_in, unsigned int a, unsigned int b, unsigned int * out)
#include "immintrin.h"
Instruction: sbb r32, r32
Description
Add unsigned 8-bit borrow b_in (carry flag) to unsigned 32-bit integer a, and subtract the result from unsigned 32-bit integer b. Store the unsigned 32-bit result in out, and the carry-out in dst (carry or overflow flag).
Operation
dst:out[31:0] := (b[31:0] - (a[31:0] + b_in));
sbb
unsigned char _subborrow_u64 (unsigned char b_in, unsigned __int64 a, unsigned __int64 b, unsigned __int64 * out)
Synopsis
unsigned char _subborrow_u64 (unsigned char b_in, unsigned __int64 a, unsigned __int64 b, unsigned __int64 * out)
#include "immintrin.h"
Instruction: sbb r64, r64
Description
Add unsigned 8-bit borrow b_in (carry flag) to unsigned 64-bit integer a, and subtract the result from unsigned 64-bit integer b. Store the unsigned 64-bit result in out, and the carry-out in dst (carry or overflow flag).
Operation
dst:out[63:0] := (b[63:0] - (a[63:0] + b_in));
vpsubrd
__m512i _mm512_mask_subr_epi32 (__m512i src, __mmask16 k, __m512i v2, __m512i v3)
Synopsis
__m512i _mm512_mask_subr_epi32 (__m512i src, __mmask16 k, __m512i v2, __m512i v3)
#include "immintrin.h"
Instruction: vpsubrd zmm {k}, zmm, zmm
CPUID Flags: KNCNI
Description
Performs element-by-element subtraction of packed 32-bit integer elements in v2 from v3 storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := v3[i+31:i] - v2[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpsubrd
__m512i _mm512_subr_epi32 (__m512i v2, __m512i v3)
Synopsis
__m512i _mm512_subr_epi32 (__m512i v2, __m512i v3)
#include "immintrin.h"
Instruction: vpsubrd zmm {k}, zmm, zmm
CPUID Flags: KNCNI
Description
Performs element-by-element subtraction of packed 32-bit integer elements in v2 from v3 storing the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := v3[i+31:i] - v2[i+31:i]
ENDFOR
dst[MAX:512] := 0
vsubrpd
__m512d _mm512_mask_subr_pd (__m512d src, __mmask8 k, __m512d v2, __m512d v3)
Synopsis
__m512d _mm512_mask_subr_pd (__m512d src, __mmask8 k, __m512d v2, __m512d v3)
#include "immintrin.h"
Instruction: vsubrpd zmm {k}, zmm, zmm
CPUID Flags: KNCNI
Description
Performs element-by-element subtraction of packed double-precision (64-bit) floating-point elements in v2 from v3 storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := v3[i+63:i] - v2[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vsubrpd
__m512d _mm512_subr_pd (__m512d v2, __m512d v3)
Synopsis
__m512d _mm512_subr_pd (__m512d v2, __m512d v3)
#include "immintrin.h"
Instruction: vsubrpd zmm {k}, zmm, zmm
CPUID Flags: KNCNI
Description
Performs element-by-element subtraction of packed double-precision (64-bit) floating-point elements in v2 from v3 storing the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := v3[i+63:i] - v2[i+63:i]
ENDFOR
dst[MAX:512] := 0
vsubrps
__m512 _mm512_mask_subr_ps (__m512 src, __mmask16 k, __m512 v2, __m512 v3)
Synopsis
__m512 _mm512_mask_subr_ps (__m512 src, __mmask16 k, __m512 v2, __m512 v3)
#include "immintrin.h"
Instruction: vsubrps zmm {k}, zmm, zmm
CPUID Flags: KNCNI
Description
Performs element-by-element subtraction of packed single-precision (32-bit) floating-point elements in v2 from v3 storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := v3[i+31:i] - v2[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vsubrps
__m512 _mm512_subr_ps (__m512 v2, __m512 v3)
Synopsis
__m512 _mm512_subr_ps (__m512 v2, __m512 v3)
#include "immintrin.h"
Instruction: vsubrps zmm {k}, zmm, zmm
CPUID Flags: KNCNI
Description
Performs element-by-element subtraction of packed single-precision (32-bit) floating-point elements in v2 from v3 storing the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := v3[i+31:i] - v2[i+31:i]
ENDFOR
dst[MAX:512] := 0
vsubrpd
__m512d _mm512_mask_subr_round_pd (__m512d src, __mmask8 k, __m512d v2, __m512d v3, int rounding)
Synopsis
__m512d _mm512_mask_subr_round_pd (__m512d src, __mmask8 k, __m512d v2, __m512d v3, int rounding)
#include "immintrin.h"
Instruction: vsubrpd zmm {k}, zmm, zmm
CPUID Flags: KNCNI
Description
Performs element-by-element subtraction of packed double-precision (64-bit) floating-point elements in
v2 from
v3 storing the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := v3[i+63:i] - v2[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vsubrpd
__m512d _mm512_subr_round_pd (__m512d v2, __m512d v3, int rounding)
Synopsis
__m512d _mm512_subr_round_pd (__m512d v2, __m512d v3, int rounding)
#include "immintrin.h"
Instruction: vsubrpd zmm {k}, zmm, zmm
CPUID Flags: KNCNI
Description
Performs element-by-element subtraction of packed double-precision (64-bit) floating-point elements in
v2 from
v3 storing the results in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := v3[i+63:i] - v2[i+63:i]
ENDFOR
dst[MAX:512] := 0
vsubrps
__m512 _mm512_mask_subr_round_ps (__m512 src, __mmask16 k, __m512 v2, __m512 v3, int rounding)
Synopsis
__m512 _mm512_mask_subr_round_ps (__m512 src, __mmask16 k, __m512 v2, __m512 v3, int rounding)
#include "immintrin.h"
Instruction: vsubrps zmm {k}, zmm, zmm
CPUID Flags: KNCNI
Description
Performs element-by-element subtraction of packed single-precision (32-bit) floating-point elements in
v2 from
v3 storing the results in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set).
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := v3[i+31:i] - v2[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vsubrps
__m512 _mm512_subr_round_ps (__m512 v2, __m512 v3, int rounding)
Synopsis
__m512 _mm512_subr_round_ps (__m512 v2, __m512 v3, int rounding)
#include "immintrin.h"
Instruction: vsubrps zmm {k}, zmm, zmm
CPUID Flags: KNCNI
Description
Performs element-by-element subtraction of packed single-precision (32-bit) floating-point elements in
v2 from
v3 storing the results in
dst.
Rounding is done according to the
rounding parameter, which can be one of:
(_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions
(_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and suppress exceptions
(_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress exceptions
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress exceptions
_MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := v3[i+31:i] - v2[i+31:i]
ENDFOR
dst[MAX:512] := 0
vpsubrsetbd
__m512i _mm512_mask_subrsetb_epi32 (__m512i v2, __mmask16 k, __mmask16 k_old, __m512i v3, __mmask16 * borrow)
Synopsis
__m512i _mm512_mask_subrsetb_epi32 (__m512i v2, __mmask16 k, __mmask16 k_old, __m512i v3, __mmask16 * borrow)
#include "immintrin.h"
Instruction: vpsubrsetbd zmm {k}, k, zmm
CPUID Flags: KNCNI
Description
Performs element-by-element subtraction of packed 32-bit integer elements in v2 from v3, storing the results in dst and v2. The borrowed value from the subtraction difference for the nth element is written to the nth bit of borrow (borrow flag). Results are written using writemask k (elements are copied from v2 and k_old when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
diff := v3[i+31:i] - v2[i+31:i]
borrow[j] := Borrow(v3[i+31:i] - v2[i+31:i])
dst[i+31:i] := diff
v2[i+31:i] := diff
ELSE
borrow[j] := k_old[j]
FI
ENDFOR
dst[MAX:512] := 0
vpsubrsetbd
__m512i _mm512_subrsetb_epi32 (__m512i v2, __m512i v3, __mmask16 * borrow)
Synopsis
__m512i _mm512_subrsetb_epi32 (__m512i v2, __m512i v3, __mmask16 * borrow)
#include "immintrin.h"
Instruction: vpsubrsetbd zmm {k}, k, zmm
CPUID Flags: KNCNI
Description
Performs element-by-element subtraction of packed 32-bit integer elements in v2 from v3, storing the results in dst and v2. The borrowed value from the subtraction difference for the nth element is written to the nth bit of borrow (borrow flag).
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := v3[i+31:i] - v2[i+31:i]
borrow[j] := Borrow(v3[i+31:i] - v2[i+31:i])
ENDFOR
dst[MAX:512] := 0
vpsubsw
__m128i _mm_mask_subs_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_subs_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpsubsw
CPUID Flags: AVX512VL + AVX512BW
Description
Subtract packed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := Saturate_To_Int16(a[i+15:i] - b[i+15:i])
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:128] := 0
vpsubsw
__m128i _mm_maskz_subs_epi16 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_subs_epi16 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpsubsw
CPUID Flags: AVX512VL + AVX512BW
Description
Subtract packed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := Saturate_To_Int16(a[i+15:i] - b[i+15:i])
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
psubsw
__m128i _mm_subs_epi16 (__m128i a, __m128i b)
Synopsis
__m128i _mm_subs_epi16 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: psubsw xmm, xmm
CPUID Flags: SSE2
Description
Subtract packed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*16
dst[i+15:i] := Saturate_To_Int16(a[i+15:i] - b[i+15:i])
ENDFOR
Performance
vpsubsw
__m256i _mm256_mask_subs_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_subs_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsubsw
CPUID Flags: AVX512VL + AVX512BW
Description
Subtract packed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := Saturate_To_Int16(a[i+15:i] - b[i+15:i])
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
vpsubsw
__m256i _mm256_maskz_subs_epi16 (__mmask16 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_subs_epi16 (__mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsubsw
CPUID Flags: AVX512VL + AVX512BW
Description
Subtract packed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := Saturate_To_Int16(a[i+15:i] - b[i+15:i])
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpsubsw
__m256i _mm256_subs_epi16 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_subs_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsubsw ymm, ymm, ymm
CPUID Flags: AVX2
Description
Subtract packed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*16
dst[i+15:i] := Saturate_To_Int16(a[i+15:i] - b[i+15:i])
ENDFOR
dst[MAX:256] := 0
Performance
vpsubsw
__m512i _mm512_mask_subs_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_subs_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpsubsw
CPUID Flags: AVX512BW
Description
Subtract packed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := Saturate_To_Int16(a[i+15:i] - b[i+15:i])
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:512] := 0
vpsubsw
__m512i _mm512_maskz_subs_epi16 (__mmask32 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_subs_epi16 (__mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpsubsw
CPUID Flags: AVX512BW
Description
Subtract packed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := Saturate_To_Int16(a[i+15:i] - b[i+15:i])
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpsubsw
__m512i _mm512_subs_epi16 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_subs_epi16 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpsubsw
CPUID Flags: AVX512BW
Description
Subtract packed 16-bit integers in b from packed 16-bit integers in a using saturation, and store the results in dst.
Operation
FOR j := 0 to 31
i := j*16
dst[i+15:i] := Saturate_To_Int16(a[i+15:i] - b[i+15:i])
ENDFOR
dst[MAX:512] := 0
vpsubsb
__m128i _mm_mask_subs_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_subs_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpsubsb
CPUID Flags: AVX512VL + AVX512BW
Description
Subtract packed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k[j]
dst[i+7:i] := Saturate_To_Int8(a[i+7:i] - b[i+7:i])
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:128] := 0
vpsubsb
__m128i _mm_maskz_subs_epi8 (__mmask16 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_subs_epi8 (__mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpsubsb
CPUID Flags: AVX512VL + AVX512BW
Description
Subtract packed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k[j]
dst[i+7:i] := Saturate_To_Int8(a[i+7:i] - b[i+7:i])
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
psubsb
__m128i _mm_subs_epi8 (__m128i a, __m128i b)
Synopsis
__m128i _mm_subs_epi8 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: psubsb xmm, xmm
CPUID Flags: SSE2
Description
Subtract packed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*8
dst[i+7:i] := Saturate_To_Int8(a[i+7:i] - b[i+7:i])
ENDFOR
Performance
vpsubsb
__m256i _mm256_mask_subs_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_subs_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsubsb
CPUID Flags: AVX512VL + AVX512BW
Description
Subtract packed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k[j]
dst[i+7:i] := Saturate_To_Int8(a[i+7:i] - b[i+7:i])
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:256] := 0
vpsubsb
__m256i _mm256_maskz_subs_epi8 (__mmask32 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_subs_epi8 (__mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsubsb
CPUID Flags: AVX512VL + AVX512BW
Description
Subtract packed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k[j]
dst[i+7:i] := Saturate_To_Int8(a[i+7:i] - b[i+7:i])
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpsubsb
__m256i _mm256_subs_epi8 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_subs_epi8 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsubsb ymm, ymm, ymm
CPUID Flags: AVX2
Description
Subtract packed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst.
Operation
FOR j := 0 to 31
i := j*8
dst[i+7:i] := Saturate_To_Int8(a[i+7:i] - b[i+7:i])
ENDFOR
dst[MAX:256] := 0
Performance
vpsubsb
__m512i _mm512_mask_subs_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_subs_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpsubsb
CPUID Flags: AVX512BW
Description
Subtract packed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k[j]
dst[i+7:i] := Saturate_To_Int8(a[i+7:i] - b[i+7:i])
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:512] := 0
vpsubsb
__m512i _mm512_maskz_subs_epi8 (__mmask64 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_subs_epi8 (__mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpsubsb
CPUID Flags: AVX512BW
Description
Subtract packed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k[j]
dst[i+7:i] := Saturate_To_Int8(a[i+7:i] - b[i+7:i])
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpsubsb
__m512i _mm512_subs_epi8 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_subs_epi8 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpsubsb
CPUID Flags: AVX512BW
Description
Subtract packed 8-bit integers in b from packed 8-bit integers in a using saturation, and store the results in dst.
Operation
FOR j := 0 to 63
i := j*8
dst[i+7:i] := Saturate_To_Int8(a[i+7:i] - b[i+7:i])
ENDFOR
dst[MAX:512] := 0
vpsubusw
__m128i _mm_mask_subs_epu16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_subs_epu16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpsubusw
CPUID Flags: AVX512VL + AVX512BW
Description
Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := Saturate_To_UnsignedInt16(a[i+15:i] - b[i+15:i])
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:128] := 0
vpsubusw
__m128i _mm_maskz_subs_epu16 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_subs_epu16 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpsubusw
CPUID Flags: AVX512VL + AVX512BW
Description
Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := Saturate_To_UnsignedInt16(a[i+15:i] - b[i+15:i])
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
psubusw
__m128i _mm_subs_epu16 (__m128i a, __m128i b)
Synopsis
__m128i _mm_subs_epu16 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: psubusw xmm, xmm
CPUID Flags: SSE2
Description
Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*16
dst[i+15:i] := Saturate_To_UnsignedInt16(a[i+15:i] - b[i+15:i])
ENDFOR
Performance
vpsubusw
__m256i _mm256_mask_subs_epu16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_subs_epu16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsubusw
CPUID Flags: AVX512VL + AVX512BW
Description
Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := Saturate_To_UnsignedInt16(a[i+15:i] - b[i+15:i])
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
vpsubusw
__m256i _mm256_maskz_subs_epu16 (__mmask16 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_subs_epu16 (__mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsubusw
CPUID Flags: AVX512VL + AVX512BW
Description
Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := Saturate_To_UnsignedInt16(a[i+15:i] - b[i+15:i])
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpsubusw
__m256i _mm256_subs_epu16 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_subs_epu16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsubusw ymm, ymm, ymm
CPUID Flags: AVX2
Description
Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*16
dst[i+15:i] := Saturate_To_UnsignedInt16(a[i+15:i] - b[i+15:i])
ENDFOR
dst[MAX:256] := 0
Performance
vpsubusw
__m512i _mm512_mask_subs_epu16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_subs_epu16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpsubusw
CPUID Flags: AVX512BW
Description
Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := Saturate_To_UnsignedInt16(a[i+15:i] - b[i+15:i])
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:512] := 0
vpsubusw
__m512i _mm512_maskz_subs_epu16 (__mmask32 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_subs_epu16 (__mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpsubusw
CPUID Flags: AVX512BW
Description
Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := Saturate_To_UnsignedInt16(a[i+15:i] - b[i+15:i])
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpsubusw
__m512i _mm512_subs_epu16 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_subs_epu16 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpsubusw
CPUID Flags: AVX512BW
Description
Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation, and store the results in dst.
Operation
FOR j := 0 to 31
i := j*16
dst[i+15:i] := Saturate_To_UnsignedInt16(a[i+15:i] - b[i+15:i])
ENDFOR
dst[MAX:512] := 0
vpsubusb
__m128i _mm_mask_subs_epu8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_subs_epu8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpsubusb
CPUID Flags: AVX512VL + AVX512BW
Description
Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k[j]
dst[i+7:i] := Saturate_To_UnsignedInt8(a[i+7:i] - b[i+7:i])
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:128] := 0
vpsubusb
__m128i _mm_maskz_subs_epu8 (__mmask16 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_subs_epu8 (__mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpsubusb
CPUID Flags: AVX512VL + AVX512BW
Description
Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*8
IF k[j]
dst[i+7:i] := Saturate_To_UnsignedInt8(a[i+7:i] - b[i+7:i])
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
psubusb
__m128i _mm_subs_epu8 (__m128i a, __m128i b)
Synopsis
__m128i _mm_subs_epu8 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: psubusb xmm, xmm
CPUID Flags: SSE2
Description
Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*8
dst[i+7:i] := Saturate_To_UnsignedInt8(a[i+7:i] - b[i+7:i])
ENDFOR
Performance
vpsubusb
__m256i _mm256_mask_subs_epu8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_subs_epu8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsubusb
CPUID Flags: AVX512VL + AVX512BW
Description
Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k[j]
dst[i+7:i] := Saturate_To_UnsignedInt8(a[i+7:i] - b[i+7:i])
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:256] := 0
vpsubusb
__m256i _mm256_maskz_subs_epu8 (__mmask32 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_subs_epu8 (__mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsubusb
CPUID Flags: AVX512VL + AVX512BW
Description
Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 31
i := j*8
IF k[j]
dst[i+7:i] := Saturate_To_UnsignedInt8(a[i+7:i] - b[i+7:i])
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpsubusb
__m256i _mm256_subs_epu8 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_subs_epu8 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpsubusb ymm, ymm, ymm
CPUID Flags: AVX2
Description
Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst.
Operation
FOR j := 0 to 31
i := j*8
dst[i+7:i] := Saturate_To_UnsignedInt8(a[i+7:i] - b[i+7:i])
ENDFOR
dst[MAX:256] := 0
Performance
vpsubusb
__m512i _mm512_mask_subs_epu8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_subs_epu8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpsubusb
CPUID Flags: AVX512BW
Description
Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k[j]
dst[i+7:i] := Saturate_To_UnsignedInt8(a[i+7:i] - b[i+7:i])
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:512] := 0
vpsubusb
__m512i _mm512_maskz_subs_epu8 (__mmask64 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_subs_epu8 (__mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpsubusb
CPUID Flags: AVX512BW
Description
Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 63
i := j*8
IF k[j]
dst[i+7:i] := Saturate_To_UnsignedInt8(a[i+7:i] - b[i+7:i])
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpsubusb
__m512i _mm512_subs_epu8 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_subs_epu8 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpsubusb
CPUID Flags: AVX512BW
Description
Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation, and store the results in dst.
Operation
FOR j := 0 to 63
i := j*8
dst[i+7:i] := Saturate_To_UnsignedInt8(a[i+7:i] - b[i+7:i])
ENDFOR
dst[MAX:512] := 0
vpsubsetbd
__m512i _mm512_mask_subsetb_epi32 (__m512i v2, __mmask16 k, __mmask16 k_old, __m512i v3, __mmask16 * borrow)
Synopsis
__m512i _mm512_mask_subsetb_epi32 (__m512i v2, __mmask16 k, __mmask16 k_old, __m512i v3, __mmask16 * borrow)
#include "immintrin.h"
Instruction: vpsubsetbd zmm {k}, k, zmm
CPUID Flags: KNCNI
Description
Performs element-by-element subtraction of packed 32-bit integer elements in v3 from v2, storing the results in dst and the nth borrow bit in the nth position of borrow (borrow flag). Results are stored using writemask k (elements are copied from v2 and k_old when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := v2[i+31:i] - v3[i+31:i]
borrow[j] := Borrow(v2[i+31:i] - v3[i+31:i])
ELSE
dst[i+31:i] := v2[i+31:i]
borrow[j] := k_old[j]
FI
ENDFOR
dst[MAX:512] := 0
vpsubsetbd
__m512i _mm512_subsetb_epi32 (__m512i v2, __m512i v3, __mmask16 * borrow)
Synopsis
__m512i _mm512_subsetb_epi32 (__m512i v2, __m512i v3, __mmask16 * borrow)
#include "immintrin.h"
Instruction: vpsubsetbd zmm {k}, k, zmm
CPUID Flags: KNCNI
Description
Performs element-by-element subtraction of packed 32-bit integer elements in v3 from v2, storing the results in dst and the nth borrow bit in the nth position of borrow (borrow flag).
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := v2[i+31:i] - v3[i+31:i]
borrow[j] := Borrow(v2[i+31:i] - v3[i+31:i])
ENDFOR
dst[MAX:512] := 0
...
__m128d _mm_svml_ceil_pd (__m128d a)
Synopsis
__m128d _mm_svml_ceil_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Round the packed double-precision (64-bit) floating-point elements in a up to an integer value, and store the results as packed double-precision floating-point elements in dst. This intrinsic may generate the roundpd/vroundpd instruction.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := CEIL(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
...
__m256d _mm256_svml_ceil_pd (__m256d a)
Synopsis
__m256d _mm256_svml_ceil_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Round the packed double-precision (64-bit) floating-point elements in a up to an integer value, and store the results as packed double-precision floating-point elements in dst. This intrinsic may generate the roundpd/vroundpd instruction.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := CEIL(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
...
__m128 _mm_svml_ceil_ps (__m128 a)
Synopsis
__m128 _mm_svml_ceil_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Round the packed single-precision (32-bit) floating-point elements in a up to an integer value, and store the results as packed single-precision floating-point elements in dst. This intrinsic may generate the roundps/vroundps instruction.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := CEIL(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256 _mm256_svml_ceil_ps (__m256 a)
Synopsis
__m256 _mm256_svml_ceil_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Round the packed single-precision (32-bit) floating-point elements in a up to an integer value, and store the results as packed single-precision floating-point elements in dst. This intrinsic may generate the roundps/vroundps instruction.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := CEIL(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
...
__m128d _mm_svml_floor_pd (__m128d a)
Synopsis
__m128d _mm_svml_floor_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Round the packed double-precision (64-bit) floating-point elements in a down to an integer value, and store the results as packed double-precision floating-point elements in dst. This intrinsic may generate the roundpd/vroundpd instruction.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := FLOOR(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
...
__m256d _mm256_svml_floor_pd (__m256d a)
Synopsis
__m256d _mm256_svml_floor_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Round the packed double-precision (64-bit) floating-point elements in a down to an integer value, and store the results as packed double-precision floating-point elements in dst. This intrinsic may generate the roundpd/vroundpd instruction.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := FLOOR(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
...
__m128 _mm_svml_floor_ps (__m128 a)
Synopsis
__m128 _mm_svml_floor_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Round the packed single-precision (32-bit) floating-point elements in a down to an integer value, and store the results as packed single-precision floating-point elements in dst. This intrinsic may generate the roundps/vroundps instruction.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := FLOOR(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256 _mm256_svml_floor_ps (__m256 a)
Synopsis
__m256 _mm256_svml_floor_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Round the packed single-precision (32-bit) floating-point elements in a down to an integer value, and store the results as packed single-precision floating-point elements in dst. This intrinsic may generate the roundps/vroundps instruction.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := FLOOR(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
...
__m128d _mm_svml_round_pd (__m128d a)
Synopsis
__m128d _mm_svml_round_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Round the packed double-precision (64-bit) floating-point elements in a to the nearest integer value, and store the results as packed double-precision floating-point elements in dst. This intrinsic may generate the roundpd/vroundpd instruction.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := ROUND(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
...
__m256d _mm256_svml_round_pd (__m256d a)
Synopsis
__m256d _mm256_svml_round_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Round the packed double-precision (64-bit) floating-point elements in a to the nearest integer value, and store the results as packed double-precision floating-point elements in dst. This intrinsic may generate the roundpd/vroundpd instruction.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := ROUND(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
...
__m512d _mm512_mask_svml_round_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_svml_round_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Round the packed double-precision (64-bit) floating-point elements in
a to the nearest integer value, and store the results as packed double-precision floating-point elements in
dst using writemask
k (elements are copied from
src when the corresponding mask bit is not set).
Results are rounded to the nearest integer value. Note: this intrinsic does not accept a rounding-mode parameter; the _MM_FROUND_* rounding-control constants do not apply to it.
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := ROUND(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m512d _mm512_svml_round_pd (__m512d a)
Synopsis
__m512d _mm512_svml_round_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Round the packed double-precision (64-bit) floating-point elements in a to the nearest integer value, and store the results as packed double-precision floating-point elements in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := ROUND(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
...
__m128 _mm_svml_round_ps (__m128 a)
Synopsis
__m128 _mm_svml_round_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Round the packed single-precision (32-bit) floating-point elements in a to the nearest integer value, and store the results as packed single-precision floating-point elements in dst. This intrinsic may generate the roundps/vroundps instruction.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := ROUND(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256 _mm256_svml_round_ps (__m256 a)
Synopsis
__m256 _mm256_svml_round_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Round the packed single-precision (32-bit) floating-point elements in a to the nearest integer value, and store the results as packed single-precision floating-point elements in dst. This intrinsic may generate the roundps/vroundps instruction.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := ROUND(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
...
__m128d _mm_svml_sqrt_pd (__m128d a)
Synopsis
__m128d _mm_svml_sqrt_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. Note that this intrinsic is less efficient than _mm_sqrt_pd.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := SQRT(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
...
__m256d _mm256_svml_sqrt_pd (__m256d a)
Synopsis
__m256d _mm256_svml_sqrt_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the square root of packed double-precision (64-bit) floating-point elements in a, and store the results in dst. Note that this intrinsic is less efficient than _mm256_sqrt_pd.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := SQRT(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
...
__m128 _mm_svml_sqrt_ps (__m128 a)
Synopsis
__m128 _mm_svml_sqrt_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. Note that this intrinsic is less efficient than _mm_sqrt_ps.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := SQRT(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256 _mm256_svml_sqrt_ps (__m256 a)
Synopsis
__m256 _mm256_svml_sqrt_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the square root of packed single-precision (32-bit) floating-point elements in a, and store the results in dst. Note that this intrinsic is less efficient than _mm256_sqrt_ps.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := SQRT(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
...
__m512i _mm512_mask_swizzle_epi32 (__m512i src, __mmask16 k, __m512i v, _MM_SWIZZLE_ENUM s)
Synopsis
__m512i _mm512_mask_swizzle_epi32 (__m512i src, __mmask16 k, __m512i v, _MM_SWIZZLE_ENUM s)
#include "immintrin.h"
CPUID Flags: KNCNI
Description
Performs a swizzle transformation of each of the four groups of packed 4x32-bit integer elements in v using swizzle parameter s, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
CASE s OF
_MM_SWIZ_REG_NONE:
dst[511:0] := v[511:0]
_MM_SWIZ_REG_DCBA:
dst[511:0] := v[511:0]
_MM_SWIZ_REG_CDAB:
FOR j := 0 to 7
i := j*64
IF k[j*2]
dst[i+31:i] := v[i+63:i+32]
ELSE
dst[i+31:i] := src[i+31:i]
FI
IF k[j*2+1]
dst[i+63:i+32] := v[i+31:i]
ELSE
dst[i+63:i+32] := src[i+63:i+32]
FI
ENDFOR
_MM_SWIZ_REG_BADC:
FOR j := 0 to 3
i := j*128
IF k[j*4]
dst[i+31:i] := v[i+95:i+64]
ELSE
dst[i+31:i] := src[i+31:i]
FI
IF k[j*4+1]
dst[i+63:i+32] := v[i+127:i+96]
ELSE
dst[i+63:i+32] := src[i+63:i+32]
FI
IF k[j*4+2]
dst[i+95:i+64] := v[i+31:i]
ELSE
dst[i+95:i+64] := src[i+95:i+64]
FI
IF k[j*4+3]
dst[i+127:i+96] := v[i+63:i+32]
ELSE
dst[i+127:i+96] := src[i+127:i+96]
FI
ENDFOR
_MM_SWIZ_REG_AAAA:
FOR j := 0 to 3
i := j*128
IF k[j*4]
dst[i+31:i] := v[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
IF k[j*4+1]
dst[i+63:i+32] := v[i+31:i]
ELSE
dst[i+63:i+32] := src[i+63:i+32]
FI
IF k[j*4+2]
dst[i+95:i+64] := v[i+31:i]
ELSE
dst[i+95:i+64] := src[i+95:i+64]
FI
IF k[j*4+3]
dst[i+127:i+96] := v[i+31:i]
ELSE
dst[i+127:i+96] := src[i+127:i+96]
FI
ENDFOR
_MM_SWIZ_REG_BBBB:
FOR j := 0 to 3
i := j*128
IF k[j*4]
dst[i+31:i] := v[i+63:i+32]
ELSE
dst[i+31:i] := src[i+31:i]
FI
IF k[j*4+1]
dst[i+63:i+32] := v[i+63:i+32]
ELSE
dst[i+63:i+32] := src[i+63:i+32]
FI
IF k[j*4+2]
dst[i+95:i+64] := v[i+63:i+32]
ELSE
dst[i+95:i+64] := src[i+95:i+64]
FI
IF k[j*4+3]
dst[i+127:i+96] := v[i+63:i+32]
ELSE
dst[i+127:i+96] := src[i+127:i+96]
FI
ENDFOR
_MM_SWIZ_REG_CCCC:
FOR j := 0 to 3
i := j*128
IF k[j*4]
dst[i+31:i] := v[i+95:i+64]
ELSE
dst[i+31:i] := src[i+31:i]
FI
IF k[j*4+1]
dst[i+63:i+32] := v[i+95:i+64]
ELSE
dst[i+63:i+32] := src[i+63:i+32]
FI
IF k[j*4+2]
dst[i+95:i+64] := v[i+95:i+64]
ELSE
dst[i+95:i+64] := src[i+95:i+64]
FI
IF k[j*4+3]
dst[i+127:i+96] := v[i+95:i+64]
ELSE
dst[i+127:i+96] := src[i+127:i+96]
FI
ENDFOR
_MM_SWIZ_REG_DDDD:
FOR j := 0 to 3
i := j*128
IF k[j*4]
dst[i+31:i] := v[i+127:i+96]
ELSE
dst[i+31:i] := src[i+31:i]
FI
IF k[j*4+1]
dst[i+63:i+32] := v[i+127:i+96]
ELSE
dst[i+63:i+32] := src[i+63:i+32]
FI
IF k[j*4+2]
dst[i+95:i+64] := v[i+127:i+96]
ELSE
dst[i+95:i+64] := src[i+95:i+64]
FI
IF k[j*4+3]
dst[i+127:i+96] := v[i+127:i+96]
ELSE
dst[i+127:i+96] := src[i+127:i+96]
FI
ENDFOR
_MM_SWIZ_REG_DACB:
FOR j := 0 to 3
i := j*128
IF k[j*4]
dst[i+31:i] := v[i+63:i+32]
ELSE
dst[i+31:i] := src[i+31:i]
FI
IF k[j*4+1]
dst[i+63:i+32] := v[i+95:i+64]
ELSE
dst[i+63:i+32] := src[i+63:i+32]
FI
IF k[j*4+2]
dst[i+95:i+64] := v[i+31:i]
ELSE
dst[i+95:i+64] := src[i+95:i+64]
FI
IF k[j*4+3]
dst[i+127:i+96] := v[i+127:i+96]
ELSE
dst[i+127:i+96] := src[i+127:i+96]
FI
ENDFOR
ESAC
dst[MAX:512] := 0
...
__m512i _mm512_swizzle_epi32 (__m512i v, _MM_SWIZZLE_ENUM s)
Synopsis
__m512i _mm512_swizzle_epi32 (__m512i v, _MM_SWIZZLE_ENUM s)
#include "immintrin.h"
CPUID Flags: KNCNI
Description
Performs a swizzle transformation of each of the four groups of packed 4x32-bit integer elements in v using swizzle parameter s, storing the results in dst.
Operation
CASE s OF
_MM_SWIZ_REG_NONE:
dst[511:0] := v[511:0]
_MM_SWIZ_REG_DCBA:
dst[511:0] := v[511:0]
_MM_SWIZ_REG_CDAB:
FOR j := 0 to 7
i := j*64
dst[i+31:i] := v[i+63:i+32]
dst[i+63:i+32] := v[i+31:i]
ENDFOR
_MM_SWIZ_REG_BADC:
FOR j := 0 to 3
i := j*128
dst[i+31:i] := v[i+95:i+64]
dst[i+63:i+32] := v[i+127:i+96]
dst[i+95:i+64] := v[i+31:i]
dst[i+127:i+96] := v[i+63:i+32]
ENDFOR
_MM_SWIZ_REG_AAAA:
FOR j := 0 to 3
i := j*128
dst[i+31:i] := v[i+31:i]
dst[i+63:i+32] := v[i+31:i]
dst[i+95:i+64] := v[i+31:i]
dst[i+127:i+96] := v[i+31:i]
ENDFOR
_MM_SWIZ_REG_BBBB:
FOR j := 0 to 3
i := j*128
dst[i+31:i] := v[i+63:i+32]
dst[i+63:i+32] := v[i+63:i+32]
dst[i+95:i+64] := v[i+63:i+32]
dst[i+127:i+96] := v[i+63:i+32]
ENDFOR
_MM_SWIZ_REG_CCCC:
FOR j := 0 to 3
i := j*128
dst[i+31:i] := v[i+95:i+64]
dst[i+63:i+32] := v[i+95:i+64]
dst[i+95:i+64] := v[i+95:i+64]
dst[i+127:i+96] := v[i+95:i+64]
ENDFOR
_MM_SWIZ_REG_DDDD:
FOR j := 0 to 3
i := j*128
dst[i+31:i] := v[i+127:i+96]
dst[i+63:i+32] := v[i+127:i+96]
dst[i+95:i+64] := v[i+127:i+96]
dst[i+127:i+96] := v[i+127:i+96]
ENDFOR
_MM_SWIZ_REG_DACB:
FOR j := 0 to 3
i := j*128
dst[i+31:i] := v[i+63:i+32]
dst[i+63:i+32] := v[i+95:i+64]
dst[i+95:i+64] := v[i+31:i]
dst[i+127:i+96] := v[i+127:i+96]
ENDFOR
ESAC
dst[MAX:512] := 0
...
__m512i _mm512_mask_swizzle_epi64 (__m512i src, __mmask8 k, __m512i v, _MM_SWIZZLE_ENUM s)
Synopsis
__m512i _mm512_mask_swizzle_epi64 (__m512i src, __mmask8 k, __m512i v, _MM_SWIZZLE_ENUM s)
#include "immintrin.h"
CPUID Flags: KNCNI
Description
Performs a swizzle transformation of each of the two groups of packed 4x64-bit integer elements in v using swizzle parameter s, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
CASE s OF
_MM_SWIZ_REG_NONE:
dst[511:0] := v[511:0]
_MM_SWIZ_REG_DCBA:
dst[511:0] := v[511:0]
_MM_SWIZ_REG_CDAB:
FOR j := 0 to 3
i := j*128
IF k[j*2]
dst[i+63:i] := v[i+127:i+64]
ELSE
dst[i+63:i] := src[i+63:i]
FI
IF k[j*2+1]
dst[i+127:i+64] := v[i+63:i]
ELSE
dst[i+127:i+64] := src[i+127:i+64]
FI
ENDFOR
_MM_SWIZ_REG_BADC:
FOR j := 0 to 1
i := j*256
IF k[j*4]
dst[i+63:i] := v[i+191:i+128]
ELSE
dst[i+63:i] := src[i+63:i]
FI
IF k[j*4+1]
dst[i+127:i+64] := v[i+255:i+192]
ELSE
dst[i+127:i+64] := src[i+127:i+64]
FI
IF k[j*4+2]
dst[i+191:i+128] := v[i+63:i]
ELSE
dst[i+191:i+128] := src[i+191:i+128]
FI
IF k[j*4+3]
dst[i+255:i+192] := v[i+127:i+64]
ELSE
dst[i+255:i+192] := src[i+255:i+192]
FI
ENDFOR
_MM_SWIZ_REG_AAAA:
FOR j := 0 to 1
i := j*256
IF k[j*4]
dst[i+63:i] := v[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
IF k[j*4+1]
dst[i+127:i+64] := v[i+63:i]
ELSE
dst[i+127:i+64] := src[i+127:i+64]
FI
IF k[j*4+2]
dst[i+191:i+128] := v[i+63:i]
ELSE
dst[i+191:i+128] := src[i+191:i+128]
FI
IF k[j*4+3]
dst[i+255:i+192] := v[i+63:i]
ELSE
dst[i+255:i+192] := src[i+255:i+192]
FI
ENDFOR
_MM_SWIZ_REG_BBBB:
FOR j := 0 to 1
i := j*256
IF k[j*4]
dst[i+63:i] := v[i+127:i+64]
ELSE
dst[i+63:i] := src[i+63:i]
FI
IF k[j*4+1]
dst[i+127:i+64] := v[i+127:i+64]
ELSE
dst[i+127:i+64] := src[i+127:i+64]
FI
IF k[j*4+2]
dst[i+191:i+128] := v[i+127:i+64]
ELSE
dst[i+191:i+128] := src[i+191:i+128]
FI
IF k[j*4+3]
dst[i+255:i+192] := v[i+127:i+64]
ELSE
dst[i+255:i+192] := src[i+255:i+192]
FI
ENDFOR
_MM_SWIZ_REG_CCCC:
FOR j := 0 to 1
i := j*256
IF k[j*4]
dst[i+63:i] := v[i+191:i+128]
ELSE
dst[i+63:i] := src[i+63:i]
FI
IF k[j*4+1]
dst[i+127:i+64] := v[i+191:i+128]
ELSE
dst[i+127:i+64] := src[i+127:i+64]
FI
IF k[j*4+2]
dst[i+191:i+128] := v[i+191:i+128]
ELSE
dst[i+191:i+128] := src[i+191:i+128]
FI
IF k[j*4+3]
dst[i+255:i+192] := v[i+191:i+128]
ELSE
dst[i+255:i+192] := src[i+255:i+192]
FI
ENDFOR
_MM_SWIZ_REG_DDDD:
FOR j := 0 to 1
i := j*256
IF k[j*4]
dst[i+63:i] := v[i+255:i+192]
ELSE
dst[i+63:i] := src[i+63:i]
FI
IF k[j*4+1]
dst[i+127:i+64] := v[i+255:i+192]
ELSE
dst[i+127:i+64] := src[i+127:i+64]
FI
IF k[j*4+2]
dst[i+191:i+128] := v[i+255:i+192]
ELSE
dst[i+191:i+128] := src[i+191:i+128]
FI
IF k[j*4+3]
dst[i+255:i+192] := v[i+255:i+192]
ELSE
dst[i+255:i+192] := src[i+255:i+192]
FI
ENDFOR
_MM_SWIZ_REG_DACB:
FOR j := 0 to 1
i := j*256
IF k[j*4]
dst[i+63:i] := v[i+127:i+64]
ELSE
dst[i+63:i] := src[i+63:i]
FI
IF k[j*4+1]
dst[i+127:i+64] := v[i+191:i+128]
ELSE
dst[i+127:i+64] := src[i+127:i+64]
FI
IF k[j*4+2]
dst[i+191:i+128] := v[i+63:i]
ELSE
dst[i+191:i+128] := src[i+191:i+128]
FI
IF k[j*4+3]
dst[i+255:i+192] := v[i+255:i+192]
ELSE
dst[i+255:i+192] := src[i+255:i+192]
FI
ENDFOR
ESAC
dst[MAX:512] := 0
...
__m512i _mm512_swizzle_epi64 (__m512i v, _MM_SWIZZLE_ENUM s)
Synopsis
__m512i _mm512_swizzle_epi64 (__m512i v, _MM_SWIZZLE_ENUM s)
#include "immintrin.h"
CPUID Flags: KNCNI
Description
Performs a swizzle transformation of each of the two groups of packed 4x64-bit integer elements in v using swizzle parameter s, storing the results in dst.
Operation
CASE s OF
_MM_SWIZ_REG_NONE:
dst[511:0] := v[511:0]
_MM_SWIZ_REG_DCBA:
dst[511:0] := v[511:0]
_MM_SWIZ_REG_CDAB:
FOR j := 0 to 3
i := j*128
dst[i+63:i] := v[i+127:i+64]
dst[i+127:i+64] := v[i+63:i]
ENDFOR
_MM_SWIZ_REG_BADC:
FOR j := 0 to 1
i := j*256
dst[i+63:i] := v[i+191:i+128]
dst[i+127:i+64] := v[i+255:i+192]
dst[i+191:i+128] := v[i+63:i]
dst[i+255:i+192] := v[i+127:i+64]
ENDFOR
_MM_SWIZ_REG_AAAA:
FOR j := 0 to 1
i := j*256
dst[i+63:i] := v[i+63:i]
dst[i+127:i+64] := v[i+63:i]
dst[i+191:i+128] := v[i+63:i]
dst[i+255:i+192] := v[i+63:i]
ENDFOR
_MM_SWIZ_REG_BBBB:
FOR j := 0 to 1
i := j*256
dst[i+63:i] := v[i+127:i+64]
dst[i+127:i+64] := v[i+127:i+64]
dst[i+191:i+128] := v[i+127:i+64]
dst[i+255:i+192] := v[i+127:i+64]
ENDFOR
_MM_SWIZ_REG_CCCC:
FOR j := 0 to 1
i := j*256
dst[i+63:i] := v[i+191:i+128]
dst[i+127:i+64] := v[i+191:i+128]
dst[i+191:i+128] := v[i+191:i+128]
dst[i+255:i+192] := v[i+191:i+128]
ENDFOR
_MM_SWIZ_REG_DDDD:
FOR j := 0 to 1
i := j*256
dst[i+63:i] := v[i+255:i+192]
dst[i+127:i+64] := v[i+255:i+192]
dst[i+191:i+128] := v[i+255:i+192]
dst[i+255:i+192] := v[i+255:i+192]
ENDFOR
_MM_SWIZ_REG_DACB:
FOR j := 0 to 1
i := j*256
dst[i+63:i] := v[i+127:i+64]
dst[i+127:i+64] := v[i+191:i+128]
dst[i+191:i+128] := v[i+63:i]
dst[i+255:i+192] := v[i+255:i+192]
ENDFOR
ESAC
dst[MAX:512] := 0
...
__m512d _mm512_mask_swizzle_pd (__m512d src, __mmask8 k, __m512d v, _MM_SWIZZLE_ENUM s)
Synopsis
__m512d _mm512_mask_swizzle_pd (__m512d src, __mmask8 k, __m512d v, _MM_SWIZZLE_ENUM s)
#include "immintrin.h"
CPUID Flags: KNCNI
Description
Performs a swizzle transformation of each of the two groups of packed 4x double-precision (64-bit) floating-point elements in v using swizzle parameter s, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
CASE s OF
_MM_SWIZ_REG_NONE:
dst[511:0] := v[511:0]
_MM_SWIZ_REG_DCBA:
dst[511:0] := v[511:0]
_MM_SWIZ_REG_CDAB:
FOR j := 0 to 3
i := j*128
IF k[j*2]
dst[i+63:i] := v[i+127:i+64]
ELSE
dst[i+63:i] := src[i+63:i]
FI
IF k[j*2+1]
dst[i+127:i+64] := v[i+63:i]
ELSE
dst[i+127:i+64] := src[i+127:i+64]
FI
ENDFOR
_MM_SWIZ_REG_BADC:
FOR j := 0 to 1
i := j*256
IF k[j*4]
dst[i+63:i] := v[i+191:i+128]
ELSE
dst[i+63:i] := src[i+63:i]
FI
IF k[j*4+1]
dst[i+127:i+64] := v[i+255:i+192]
ELSE
dst[i+127:i+64] := src[i+127:i+64]
FI
IF k[j*4+2]
dst[i+191:i+128] := v[i+63:i]
ELSE
dst[i+191:i+128] := src[i+191:i+128]
FI
IF k[j*4+3]
dst[i+255:i+192] := v[i+127:i+64]
ELSE
dst[i+255:i+192] := src[i+255:i+192]
FI
ENDFOR
_MM_SWIZ_REG_AAAA:
FOR j := 0 to 1
i := j*256
IF k[j*4]
dst[i+63:i] := v[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
IF k[j*4+1]
dst[i+127:i+64] := v[i+63:i]
ELSE
dst[i+127:i+64] := src[i+127:i+64]
FI
IF k[j*4+2]
dst[i+191:i+128] := v[i+63:i]
ELSE
dst[i+191:i+128] := src[i+191:i+128]
FI
IF k[j*4+3]
dst[i+255:i+192] := v[i+63:i]
ELSE
dst[i+255:i+192] := src[i+255:i+192]
FI
ENDFOR
_MM_SWIZ_REG_BBBB:
FOR j := 0 to 1
i := j*256
IF k[j*4]
dst[i+63:i] := v[i+127:i+64]
ELSE
dst[i+63:i] := src[i+63:i]
FI
IF k[j*4+1]
dst[i+127:i+64] := v[i+127:i+64]
ELSE
dst[i+127:i+64] := src[i+127:i+64]
FI
IF k[j*4+2]
dst[i+191:i+128] := v[i+127:i+64]
ELSE
dst[i+191:i+128] := src[i+191:i+128]
FI
IF k[j*4+3]
dst[i+255:i+192] := v[i+127:i+64]
ELSE
dst[i+255:i+192] := src[i+255:i+192]
FI
ENDFOR
_MM_SWIZ_REG_CCCC:
FOR j := 0 to 1
i := j*256
IF k[j*4]
dst[i+63:i] := v[i+191:i+128]
ELSE
dst[i+63:i] := src[i+63:i]
FI
IF k[j*4+1]
dst[i+127:i+64] := v[i+191:i+128]
ELSE
dst[i+127:i+64] := src[i+127:i+64]
FI
IF k[j*4+2]
dst[i+191:i+128] := v[i+191:i+128]
ELSE
dst[i+191:i+128] := src[i+191:i+128]
FI
IF k[j*4+3]
dst[i+255:i+192] := v[i+191:i+128]
ELSE
dst[i+255:i+192] := src[i+255:i+192]
FI
ENDFOR
_MM_SWIZ_REG_DDDD:
FOR j := 0 to 1
i := j*256
IF k[j*4]
dst[i+63:i] := v[i+255:i+192]
ELSE
dst[i+63:i] := src[i+63:i]
FI
IF k[j*4+1]
dst[i+127:i+64] := v[i+255:i+192]
ELSE
dst[i+127:i+64] := src[i+127:i+64]
FI
IF k[j*4+2]
dst[i+191:i+128] := v[i+255:i+192]
ELSE
dst[i+191:i+128] := src[i+191:i+128]
FI
IF k[j*4+3]
dst[i+255:i+192] := v[i+255:i+192]
ELSE
dst[i+255:i+192] := src[i+255:i+192]
FI
ENDFOR
_MM_SWIZ_REG_DACB:
FOR j := 0 to 1
i := j*256
IF k[j*4]
dst[i+63:i] := v[i+127:i+64]
ELSE
dst[i+63:i] := src[i+63:i]
FI
IF k[j*4+1]
dst[i+127:i+64] := v[i+191:i+128]
ELSE
dst[i+127:i+64] := src[i+127:i+64]
FI
IF k[j*4+2]
dst[i+191:i+128] := v[i+63:i]
ELSE
dst[i+191:i+128] := src[i+191:i+128]
FI
IF k[j*4+3]
dst[i+255:i+192] := v[i+255:i+192]
ELSE
dst[i+255:i+192] := src[i+255:i+192]
FI
ENDFOR
ESAC
dst[MAX:512] := 0
...
__m512d _mm512_swizzle_pd (__m512d v, _MM_SWIZZLE_ENUM s)
Synopsis
__m512d _mm512_swizzle_pd (__m512d v, _MM_SWIZZLE_ENUM s)
#include "immintrin.h"
CPUID Flags: KNCNI
Description
Performs a swizzle transformation of each of the two groups of packed 4x double-precision (64-bit) floating-point elements in v using swizzle parameter s, storing the results in dst.
Operation
CASE s OF
_MM_SWIZ_REG_NONE:
dst[511:0] := v[511:0]
_MM_SWIZ_REG_DCBA:
dst[511:0] := v[511:0]
_MM_SWIZ_REG_CDAB:
FOR j := 0 to 3
i := j*128
dst[i+63:i] := v[i+127:i+64]
dst[i+127:i+64] := v[i+63:i]
ENDFOR
_MM_SWIZ_REG_BADC:
FOR j := 0 to 1
i := j*256
dst[i+63:i] := v[i+191:i+128]
dst[i+127:i+64] := v[i+255:i+192]
dst[i+191:i+128] := v[i+63:i]
dst[i+255:i+192] := v[i+127:i+64]
ENDFOR
_MM_SWIZ_REG_AAAA:
FOR j := 0 to 1
i := j*256
dst[i+63:i] := v[i+63:i]
dst[i+127:i+64] := v[i+63:i]
dst[i+191:i+128] := v[i+63:i]
dst[i+255:i+192] := v[i+63:i]
ENDFOR
_MM_SWIZ_REG_BBBB:
FOR j := 0 to 1
i := j*256
dst[i+63:i] := v[i+127:i+64]
dst[i+127:i+64] := v[i+127:i+64]
dst[i+191:i+128] := v[i+127:i+64]
dst[i+255:i+192] := v[i+127:i+64]
ENDFOR
_MM_SWIZ_REG_CCCC:
FOR j := 0 to 1
i := j*256
dst[i+63:i] := v[i+191:i+128]
dst[i+127:i+64] := v[i+191:i+128]
dst[i+191:i+128] := v[i+191:i+128]
dst[i+255:i+192] := v[i+191:i+128]
ENDFOR
_MM_SWIZ_REG_DDDD:
FOR j := 0 to 1
i := j*256
dst[i+63:i] := v[i+255:i+192]
dst[i+127:i+64] := v[i+255:i+192]
dst[i+191:i+128] := v[i+255:i+192]
dst[i+255:i+192] := v[i+255:i+192]
ENDFOR
_MM_SWIZ_REG_DACB:
FOR j := 0 to 1
i := j*256
dst[i+63:i] := v[i+127:i+64]
dst[i+127:i+64] := v[i+191:i+128]
dst[i+191:i+128] := v[i+63:i]
dst[i+255:i+192] := v[i+255:i+192]
ENDFOR
ESAC
dst[MAX:512] := 0
...
__m512 _mm512_mask_swizzle_ps (__m512 src, __mmask16 k, __m512 v, _MM_SWIZZLE_ENUM s)
Synopsis
__m512 _mm512_mask_swizzle_ps (__m512 src, __mmask16 k, __m512 v, _MM_SWIZZLE_ENUM s)
#include "immintrin.h"
CPUID Flags: KNCNI
Description
Performs a swizzle transformation of each of the four groups of packed 4x single-precision (32-bit) floating-point elements in v using swizzle parameter s, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
CASE s OF
_MM_SWIZ_REG_NONE:
dst[511:0] := v[511:0]
_MM_SWIZ_REG_DCBA:
dst[511:0] := v[511:0]
_MM_SWIZ_REG_CDAB:
FOR j := 0 to 7
i := j*64
IF k[j*2]
dst[i+31:i] := v[i+63:i+32]
ELSE
dst[i+31:i] := src[i+31:i]
FI
IF k[j*2+1]
dst[i+63:i+32] := v[i+31:i]
ELSE
dst[i+63:i+32] := src[i+63:i+32]
FI
ENDFOR
_MM_SWIZ_REG_BADC:
FOR j := 0 to 3
i := j*128
IF k[j*4]
dst[i+31:i] := v[i+95:i+64]
ELSE
dst[i+31:i] := src[i+31:i]
FI
IF k[j*4+1]
dst[i+63:i+32] := v[i+127:i+96]
ELSE
dst[i+63:i+32] := src[i+63:i+32]
FI
IF k[j*4+2]
dst[i+95:i+64] := v[i+31:i]
ELSE
dst[i+95:i+64] := src[i+95:i+64]
FI
IF k[j*4+3]
dst[i+127:i+96] := v[i+63:i+32]
ELSE
dst[i+127:i+96] := src[i+127:i+96]
FI
ENDFOR
_MM_SWIZ_REG_AAAA:
FOR j := 0 to 3
i := j*128
IF k[j*4]
dst[i+31:i] := v[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
IF k[j*4+1]
dst[i+63:i+32] := v[i+31:i]
ELSE
dst[i+63:i+32] := src[i+63:i+32]
FI
IF k[j*4+2]
dst[i+95:i+64] := v[i+31:i]
ELSE
dst[i+95:i+64] := src[i+95:i+64]
FI
IF k[j*4+3]
dst[i+127:i+96] := v[i+31:i]
ELSE
dst[i+127:i+96] := src[i+127:i+96]
FI
ENDFOR
_MM_SWIZ_REG_BBBB:
FOR j := 0 to 3
i := j*128
IF k[j*4]
dst[i+31:i] := v[i+63:i+32]
ELSE
dst[i+31:i] := src[i+31:i]
FI
IF k[j*4+1]
dst[i+63:i+32] := v[i+63:i+32]
ELSE
dst[i+63:i+32] := src[i+63:i+32]
FI
IF k[j*4+2]
dst[i+95:i+64] := v[i+63:i+32]
ELSE
dst[i+95:i+64] := src[i+95:i+64]
FI
IF k[j*4+3]
dst[i+127:i+96] := v[i+63:i+32]
ELSE
dst[i+127:i+96] := src[i+127:i+96]
FI
ENDFOR
_MM_SWIZ_REG_CCCC:
FOR j := 0 to 3
i := j*128
IF k[j*4]
dst[i+31:i] := v[i+95:i+64]
ELSE
dst[i+31:i] := src[i+31:i]
FI
IF k[j*4+1]
dst[i+63:i+32] := v[i+95:i+64]
ELSE
dst[i+63:i+32] := src[i+63:i+32]
FI
IF k[j*4+2]
dst[i+95:i+64] := v[i+95:i+64]
ELSE
dst[i+95:i+64] := src[i+95:i+64]
FI
IF k[j*4+3]
dst[i+127:i+96] := v[i+95:i+64]
ELSE
dst[i+127:i+96] := src[i+127:i+96]
FI
ENDFOR
_MM_SWIZ_REG_DDDD:
FOR j := 0 to 3
i := j*128
IF k[j*4]
dst[i+31:i] := v[i+127:i+96]
ELSE
dst[i+31:i] := src[i+31:i]
FI
IF k[j*4+1]
dst[i+63:i+32] := v[i+127:i+96]
ELSE
dst[i+63:i+32] := src[i+63:i+32]
FI
IF k[j*4+2]
dst[i+95:i+64] := v[i+127:i+96]
ELSE
dst[i+95:i+64] := src[i+95:i+64]
FI
IF k[j*4+3]
dst[i+127:i+96] := v[i+127:i+96]
ELSE
dst[i+127:i+96] := src[i+127:i+96]
FI
ENDFOR
_MM_SWIZ_REG_DACB:
FOR j := 0 to 3
i := j*128
IF k[j*4]
dst[i+31:i] := v[i+63:i+32]
ELSE
dst[i+31:i] := src[i+31:i]
FI
IF k[j*4+1]
dst[i+63:i+32] := v[i+95:i+64]
ELSE
dst[i+63:i+32] := src[i+63:i+32]
FI
IF k[j*4+2]
dst[i+95:i+64] := v[i+31:i]
ELSE
dst[i+95:i+64] := src[i+95:i+64]
FI
IF k[j*4+3]
dst[i+127:i+96] := v[i+127:i+96]
ELSE
dst[i+127:i+96] := src[i+127:i+96]
FI
ENDFOR
ESAC
dst[MAX:512] := 0
...
__m512 _mm512_swizzle_ps (__m512 v, _MM_SWIZZLE_ENUM s)
Synopsis
__m512 _mm512_swizzle_ps (__m512 v, _MM_SWIZZLE_ENUM s)
#include "immintrin.h"
CPUID Flags: KNCNI
Description
Performs a swizzle transformation of each of the four groups of packed 4x single-precision (32-bit) floating-point elements in v using swizzle parameter s, storing the results in dst.
Operation
CASE s OF
_MM_SWIZ_REG_NONE:
dst[511:0] := v[511:0]
_MM_SWIZ_REG_DCBA:
dst[511:0] := v[511:0]
_MM_SWIZ_REG_CDAB:
FOR j := 0 to 7
i := j*64
dst[i+31:i] := v[i+63:i+32]
dst[i+63:i+32] := v[i+31:i]
ENDFOR
_MM_SWIZ_REG_BADC:
FOR j := 0 to 3
i := j*128
dst[i+31:i] := v[i+95:i+64]
dst[i+63:i+32] := v[i+127:i+96]
dst[i+95:i+64] := v[i+31:i]
dst[i+127:i+96] := v[i+63:i+32]
ENDFOR
_MM_SWIZ_REG_AAAA:
FOR j := 0 to 3
i := j*128
dst[i+31:i] := v[i+31:i]
dst[i+63:i+32] := v[i+31:i]
dst[i+95:i+64] := v[i+31:i]
dst[i+127:i+96] := v[i+31:i]
ENDFOR
_MM_SWIZ_REG_BBBB:
FOR j := 0 to 3
i := j*128
dst[i+31:i] := v[i+63:i+32]
dst[i+63:i+32] := v[i+63:i+32]
dst[i+95:i+64] := v[i+63:i+32]
dst[i+127:i+96] := v[i+63:i+32]
ENDFOR
_MM_SWIZ_REG_CCCC:
FOR j := 0 to 3
i := j*128
dst[i+31:i] := v[i+95:i+64]
dst[i+63:i+32] := v[i+95:i+64]
dst[i+95:i+64] := v[i+95:i+64]
dst[i+127:i+96] := v[i+95:i+64]
ENDFOR
_MM_SWIZ_REG_DDDD:
FOR j := 0 to 3
i := j*128
dst[i+31:i] := v[i+127:i+96]
dst[i+63:i+32] := v[i+127:i+96]
dst[i+95:i+64] := v[i+127:i+96]
dst[i+127:i+96] := v[i+127:i+96]
ENDFOR
_MM_SWIZ_REG_DACB:
FOR j := 0 to 3
i := j*128
dst[i+31:i] := v[i+63:i+32]
dst[i+63:i+32] := v[i+95:i+64]
dst[i+95:i+64] := v[i+31:i]
dst[i+127:i+96] := v[i+127:i+96]
ENDFOR
ESAC
dst[MAX:512] := 0
...
__m128d _mm_tan_pd (__m128d a)
Synopsis
__m128d _mm_tan_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the tangent of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := TAN(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
...
__m256d _mm256_tan_pd (__m256d a)
Synopsis
__m256d _mm256_tan_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the tangent of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := TAN(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
...
__m512d _mm512_mask_tan_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_tan_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the tangent of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := TAN(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m512d _mm512_tan_pd (__m512d a)
Synopsis
__m512d _mm512_tan_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the tangent of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := TAN(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
...
__m128 _mm_tan_ps (__m128 a)
Synopsis
__m128 _mm_tan_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the tangent of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := TAN(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256 _mm256_tan_ps (__m256 a)
Synopsis
__m256 _mm256_tan_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the tangent of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := TAN(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
...
__m512 _mm512_mask_tan_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_tan_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the tangent of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := TAN(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m512 _mm512_tan_ps (__m512 a)
Synopsis
__m512 _mm512_tan_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the tangent of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := TAN(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
...
__m128d _mm_tand_pd (__m128d a)
Synopsis
__m128d _mm_tand_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the tangent of packed double-precision (64-bit) floating-point elements in a expressed in degrees, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := TAND(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
...
__m256d _mm256_tand_pd (__m256d a)
Synopsis
__m256d _mm256_tand_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the tangent of packed double-precision (64-bit) floating-point elements in a expressed in degrees, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := TAND(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
...
__m512d _mm512_mask_tand_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_tand_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the tangent of packed double-precision (64-bit) floating-point elements in a expressed in degrees, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := TAND(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m512d _mm512_tand_pd (__m512d a)
Synopsis
__m512d _mm512_tand_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the tangent of packed double-precision (64-bit) floating-point elements in a expressed in degrees, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := TAND(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
...
__m128 _mm_tand_ps (__m128 a)
Synopsis
__m128 _mm_tand_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the tangent of packed single-precision (32-bit) floating-point elements in a expressed in degrees, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := TAND(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256 _mm256_tand_ps (__m256 a)
Synopsis
__m256 _mm256_tand_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the tangent of packed single-precision (32-bit) floating-point elements in a expressed in degrees, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := TAND(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
...
__m512 _mm512_mask_tand_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_tand_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the tangent of packed single-precision (32-bit) floating-point elements in a expressed in degrees, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := TAND(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m512 _mm512_tand_ps (__m512 a)
Synopsis
__m512 _mm512_tand_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the tangent of packed single-precision (32-bit) floating-point elements in a expressed in degrees, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := TAND(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
...
__m128d _mm_tanh_pd (__m128d a)
Synopsis
__m128d _mm_tanh_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the hyperbolic tangent of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := TANH(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
...
__m256d _mm256_tanh_pd (__m256d a)
Synopsis
__m256d _mm256_tanh_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the hyperbolic tangent of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := TANH(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
...
__m512d _mm512_mask_tanh_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_tanh_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the hyperbolic tangent of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := TANH(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m512d _mm512_tanh_pd (__m512d a)
Synopsis
__m512d _mm512_tanh_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the hyperbolic tangent of packed double-precision (64-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := TANH(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
...
__m128 _mm_tanh_ps (__m128 a)
Synopsis
__m128 _mm_tanh_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Compute the hyperbolic tangent of packed single-precision (32-bit) floating-point elements in a expressed in radians, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := TANH(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256 _mm256_tanh_ps (__m256 a)
Synopsis
__m256 _mm256_tanh_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Compute the hyperbolic tangent of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := TANH(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
...
__m512 _mm512_mask_tanh_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_tanh_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the hyperbolic tangent of packed single-precision (32-bit) floating-point elements in a, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := TANH(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m512 _mm512_tanh_ps (__m512 a)
Synopsis
__m512 _mm512_tanh_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Compute the hyperbolic tangent of packed single-precision (32-bit) floating-point elements in a, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := TANH(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
vpternlogd
__m128i _mm_mask_ternarylogic_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b, int imm8)
Synopsis
__m128i _mm_mask_ternarylogic_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b, int imm8)
#include "immintrin.h"
Instruction: vpternlogd
CPUID Flags: AVX512VL + AVX512F
Description
Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from src, a, and b are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using writemask k at 32-bit granularity (32-bit elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
FOR h := 0 to 31
index[2:0] := (src[i+h] << 2) OR (a[i+h] << 1) OR b[i+h]
dst[i+h] := imm8[index[2:0]]
ENDFOR
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vpternlogd
__m128i _mm_maskz_ternarylogic_epi32 (__mmask8 k, __m128i a, __m128i b, __m128i c, int imm8)
Synopsis
__m128i _mm_maskz_ternarylogic_epi32 (__mmask8 k, __m128i a, __m128i b, __m128i c, int imm8)
#include "immintrin.h"
Instruction: vpternlogd
CPUID Flags: AVX512VL + AVX512F
Description
Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using zeromask k at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
FOR h := 0 to 31
index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h]
dst[i+h] := imm8[index[2:0]]
ENDFOR
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpternlogd
__m128i _mm_ternarylogic_epi32 (__m128i a, __m128i b, __m128i c, int imm8)
Synopsis
__m128i _mm_ternarylogic_epi32 (__m128i a, __m128i b, __m128i c, int imm8)
#include "immintrin.h"
Instruction: vpternlogd
CPUID Flags: AVX512VL + AVX512F
Description
Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst.
Operation
FOR j := 0 to 3
i := j*32
FOR h := 0 to 31
index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h]
dst[i+h] := imm8[index[2:0]]
ENDFOR
ENDFOR
dst[MAX:128] := 0
vpternlogd
__m256i _mm256_mask_ternarylogic_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b, int imm8)
Synopsis
__m256i _mm256_mask_ternarylogic_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b, int imm8)
#include "immintrin.h"
Instruction: vpternlogd
CPUID Flags: AVX512VL + AVX512F
Description
Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from src, a, and b are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using writemask k at 32-bit granularity (32-bit elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
FOR h := 0 to 31
index[2:0] := (src[i+h] << 2) OR (a[i+h] << 1) OR b[i+h]
dst[i+h] := imm8[index[2:0]]
ENDFOR
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vpternlogd
__m256i _mm256_maskz_ternarylogic_epi32 (__mmask8 k, __m256i a, __m256i b, __m256i c, int imm8)
Synopsis
__m256i _mm256_maskz_ternarylogic_epi32 (__mmask8 k, __m256i a, __m256i b, __m256i c, int imm8)
#include "immintrin.h"
Instruction: vpternlogd
CPUID Flags: AVX512VL + AVX512F
Description
Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using zeromask k at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
FOR h := 0 to 31
index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h]
dst[i+h] := imm8[index[2:0]]
ENDFOR
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpternlogd
__m256i _mm256_ternarylogic_epi32 (__m256i a, __m256i b, __m256i c, int imm8)
Synopsis
__m256i _mm256_ternarylogic_epi32 (__m256i a, __m256i b, __m256i c, int imm8)
#include "immintrin.h"
Instruction: vpternlogd
CPUID Flags: AVX512VL + AVX512F
Description
Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst.
Operation
FOR j := 0 to 7
i := j*32
FOR h := 0 to 31
index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h]
dst[i+h] := imm8[index[2:0]]
ENDFOR
ENDFOR
dst[MAX:256] := 0
vpternlogd
__m512i _mm512_mask_ternarylogic_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b, int imm8)
Synopsis
__m512i _mm512_mask_ternarylogic_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b, int imm8)
#include "immintrin.h"
Instruction: vpternlogd zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from src, a, and b are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using writemask k at 32-bit granularity (32-bit elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
FOR h := 0 to 31
index[2:0] := (src[i+h] << 2) OR (a[i+h] << 1) OR b[i+h]
dst[i+h] := imm8[index[2:0]]
ENDFOR
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpternlogd
__m512i _mm512_maskz_ternarylogic_epi32 (__mmask16 k, __m512i a, __m512i b, __m512i c, int imm8)
Synopsis
__m512i _mm512_maskz_ternarylogic_epi32 (__mmask16 k, __m512i a, __m512i b, __m512i c, int imm8)
#include "immintrin.h"
Instruction: vpternlogd zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using zeromask k at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
FOR h := 0 to 31
index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h]
dst[i+h] := imm8[index[2:0]]
ENDFOR
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpternlogd
__m512i _mm512_ternarylogic_epi32 (__m512i a, __m512i b, __m512i c, int imm8)
Synopsis
__m512i _mm512_ternarylogic_epi32 (__m512i a, __m512i b, __m512i c, int imm8)
#include "immintrin.h"
Instruction: vpternlogd zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 32-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst.
Operation
FOR j := 0 to 15
i := j*32
FOR h := 0 to 31
index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h]
dst[i+h] := imm8[index[2:0]]
ENDFOR
ENDFOR
dst[MAX:512] := 0
vpternlogq
__m128i _mm_mask_ternarylogic_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b, int imm8)
Synopsis
__m128i _mm_mask_ternarylogic_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b, int imm8)
#include "immintrin.h"
Instruction: vpternlogq
CPUID Flags: AVX512VL + AVX512F
Description
Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from src, a, and b are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using writemask k at 64-bit granularity (64-bit elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
FOR h := 0 to 63
index[2:0] := (src[i+h] << 2) OR (a[i+h] << 1) OR b[i+h]
dst[i+h] := imm8[index[2:0]]
ENDFOR
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpternlogq
__m128i _mm_maskz_ternarylogic_epi64 (__mmask8 k, __m128i a, __m128i b, __m128i c, int imm8)
Synopsis
__m128i _mm_maskz_ternarylogic_epi64 (__mmask8 k, __m128i a, __m128i b, __m128i c, int imm8)
#include "immintrin.h"
Instruction: vpternlogq
CPUID Flags: AVX512VL + AVX512F
Description
Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using zeromask k at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
FOR h := 0 to 63
index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h]
dst[i+h] := imm8[index[2:0]]
ENDFOR
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpternlogq
__m128i _mm_ternarylogic_epi64 (__m128i a, __m128i b, __m128i c, int imm8)
Synopsis
__m128i _mm_ternarylogic_epi64 (__m128i a, __m128i b, __m128i c, int imm8)
#include "immintrin.h"
Instruction: vpternlogq
CPUID Flags: AVX512VL + AVX512F
Description
Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst.
Operation
FOR j := 0 to 1
i := j*64
FOR h := 0 to 63
index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h]
dst[i+h] := imm8[index[2:0]]
ENDFOR
ENDFOR
dst[MAX:128] := 0
vpternlogq
__m256i _mm256_mask_ternarylogic_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b, int imm8)
Synopsis
__m256i _mm256_mask_ternarylogic_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b, int imm8)
#include "immintrin.h"
Instruction: vpternlogq
CPUID Flags: AVX512VL + AVX512F
Description
Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from src, a, and b are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using writemask k at 64-bit granularity (64-bit elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
FOR h := 0 to 63
index[2:0] := (src[i+h] << 2) OR (a[i+h] << 1) OR b[i+h]
dst[i+h] := imm8[index[2:0]]
ENDFOR
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpternlogq
__m256i _mm256_maskz_ternarylogic_epi64 (__mmask8 k, __m256i a, __m256i b, __m256i c, int imm8)
Synopsis
__m256i _mm256_maskz_ternarylogic_epi64 (__mmask8 k, __m256i a, __m256i b, __m256i c, int imm8)
#include "immintrin.h"
Instruction: vpternlogq
CPUID Flags: AVX512VL + AVX512F
Description
Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using zeromask k at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
FOR h := 0 to 63
index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h]
dst[i+h] := imm8[index[2:0]]
ENDFOR
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpternlogq
__m256i _mm256_ternarylogic_epi64 (__m256i a, __m256i b, __m256i c, int imm8)
Synopsis
__m256i _mm256_ternarylogic_epi64 (__m256i a, __m256i b, __m256i c, int imm8)
#include "immintrin.h"
Instruction: vpternlogq
CPUID Flags: AVX512VL + AVX512F
Description
Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst.
Operation
FOR j := 0 to 3
i := j*64
FOR h := 0 to 63
index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h]
dst[i+h] := imm8[index[2:0]]
ENDFOR
ENDFOR
dst[MAX:256] := 0
vpternlogq
__m512i _mm512_mask_ternarylogic_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b, int imm8)
Synopsis
__m512i _mm512_mask_ternarylogic_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b, int imm8)
#include "immintrin.h"
Instruction: vpternlogq zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from src, a, and b are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using writemask k at 64-bit granularity (64-bit elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
FOR h := 0 to 63
index[2:0] := (src[i+h] << 2) OR (a[i+h] << 1) OR b[i+h]
dst[i+h] := imm8[index[2:0]]
ENDFOR
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpternlogq
__m512i _mm512_maskz_ternarylogic_epi64 (__mmask8 k, __m512i a, __m512i b, __m512i c, int imm8)
Synopsis
__m512i _mm512_maskz_ternarylogic_epi64 (__mmask8 k, __m512i a, __m512i b, __m512i c, int imm8)
#include "immintrin.h"
Instruction: vpternlogq zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst using zeromask k at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
FOR h := 0 to 63
index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h]
dst[i+h] := imm8[index[2:0]]
ENDFOR
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpternlogq
__m512i _mm512_ternarylogic_epi64 (__m512i a, __m512i b, __m512i c, int imm8)
Synopsis
__m512i _mm512_ternarylogic_epi64 (__m512i a, __m512i b, __m512i c, int imm8)
#include "immintrin.h"
Instruction: vpternlogq zmm {k}, zmm, zmm, imm
CPUID Flags: AVX512F
Description
Bitwise ternary logic that provides the capability to implement any three-operand binary function; the specific binary function is specified by value in imm8. For each bit in each packed 64-bit integer, the corresponding bit from a, b, and c are used to form a 3 bit index into imm8, and the value at that bit in imm8 is written to the corresponding bit in dst.
Operation
FOR j := 0 to 7
i := j*64
FOR h := 0 to 63
index[2:0] := (a[i+h] << 2) OR (b[i+h] << 1) OR c[i+h]
dst[i+h] := imm8[index[2:0]]
ENDFOR
ENDFOR
dst[MAX:512] := 0
...
int _mm_test_all_ones (__m128i a)
Synopsis
int _mm_test_all_ones (__m128i a)
#include "smmintrin.h"
Instruction: pcmpeqd xmm, xmm
ptest xmm, xmm
CPUID Flags: SSE4.1
Description
Compute the bitwise NOT of a, and return 1 if the result is zero (i.e. a has all bits set), otherwise return 0.
Operation
IF (((NOT a[127:0]) AND 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF) == 0)
CF := 1
ELSE
CF := 0
FI
RETURN CF
ptest
int _mm_test_all_zeros (__m128i a, __m128i mask)
Synopsis
int _mm_test_all_zeros (__m128i a, __m128i mask)
#include "smmintrin.h"
Instruction: ptest xmm, xmm
CPUID Flags: SSE4.1
Description
Compute the bitwise AND of 128 bits (representing integer data) in a and mask, and return 1 if the result is zero, otherwise return 0.
Operation
IF (a[127:0] AND mask[127:0] == 0)
ZF := 1
ELSE
ZF := 0
FI
RETURN ZF
Performance
vptestmw
__mmask8 _mm_mask_test_epi16_mask (__mmask8 k1, __m128i a, __m128i b)
Synopsis
__mmask8 _mm_mask_test_epi16_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vptestmw
CPUID Flags: AVX512VL + AVX512BW
Description
Compute the bitwise AND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero.
Operation
FOR j := 0 to 7
i := j*16
IF k1[j]
k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vptestmw
__mmask8 _mm_test_epi16_mask (__m128i a, __m128i b)
Synopsis
__mmask8 _mm_test_epi16_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vptestmw
CPUID Flags: AVX512VL + AVX512BW
Description
Compute the bitwise AND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero.
Operation
FOR j := 0 to 7
i := j*16
k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vptestmw
__mmask16 _mm256_mask_test_epi16_mask (__mmask16 k1, __m256i a, __m256i b)
Synopsis
__mmask16 _mm256_mask_test_epi16_mask (__mmask16 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vptestmw
CPUID Flags: AVX512VL + AVX512BW
Description
Compute the bitwise AND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero.
Operation
FOR j := 0 to 15
i := j*16
IF k1[j]
k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vptestmw
__mmask16 _mm256_test_epi16_mask (__m256i a, __m256i b)
Synopsis
__mmask16 _mm256_test_epi16_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vptestmw
CPUID Flags: AVX512VL + AVX512BW
Description
Compute the bitwise AND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero.
Operation
FOR j := 0 to 15
i := j*16
k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vptestmw
__mmask32 _mm512_mask_test_epi16_mask (__mmask32 k1, __m512i a, __m512i b)
Synopsis
__mmask32 _mm512_mask_test_epi16_mask (__mmask32 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vptestmw
CPUID Flags: AVX512BW
Description
Compute the bitwise AND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero.
Operation
FOR j := 0 to 31
i := j*16
IF k1[j]
k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:32] := 0
vptestmw
__mmask32 _mm512_test_epi16_mask (__m512i a, __m512i b)
Synopsis
__mmask32 _mm512_test_epi16_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vptestmw
CPUID Flags: AVX512BW
Description
Compute the bitwise AND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero.
Operation
FOR j := 0 to 31
i := j*16
k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0
ENDFOR
k[MAX:32] := 0
vptestmd
__mmask8 _mm_mask_test_epi32_mask (__mmask8 k1, __m128i a, __m128i b)
Synopsis
__mmask8 _mm_mask_test_epi32_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vptestmd
CPUID Flags: AVX512VL + AVX512F
Description
Compute the bitwise AND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero.
Operation
FOR j := 0 to 3
i := j*32
IF k1[j]
k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:4] := 0
vptestmd
__mmask8 _mm_test_epi32_mask (__m128i a, __m128i b)
Synopsis
__mmask8 _mm_test_epi32_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vptestmd
CPUID Flags: AVX512VL + AVX512F
Description
Compute the bitwise AND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero.
Operation
FOR j := 0 to 3
i := j*32
k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0
ENDFOR
k[MAX:4] := 0
vptestmd
__mmask8 _mm256_mask_test_epi32_mask (__mmask8 k1, __m256i a, __m256i b)
Synopsis
__mmask8 _mm256_mask_test_epi32_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vptestmd
CPUID Flags: AVX512VL + AVX512F
Description
Compute the bitwise AND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero.
Operation
FOR j := 0 to 7
i := j*32
IF k1[j]
k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vptestmd
__mmask8 _mm256_test_epi32_mask (__m256i a, __m256i b)
Synopsis
__mmask8 _mm256_test_epi32_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vptestmd
CPUID Flags: AVX512VL + AVX512F
Description
Compute the bitwise AND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero.
Operation
FOR j := 0 to 7
i := j*32
k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vptestmd
__mmask16 _mm512_mask_test_epi32_mask (__mmask16 k1, __m512i a, __m512i b)
Synopsis
__mmask16 _mm512_mask_test_epi32_mask (__mmask16 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vptestmd k {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compute the bitwise AND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero.
Operation
FOR j := 0 to 15
i := j*32
IF k1[j]
k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vptestmd
__mmask16 _mm512_test_epi32_mask (__m512i a, __m512i b)
Synopsis
__mmask16 _mm512_test_epi32_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vptestmd k {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compute the bitwise AND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero.
Operation
FOR j := 0 to 15
i := j*32
k[j] := ((a[i+31:i] AND b[i+31:i]) != 0) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vptestmq
__mmask8 _mm_mask_test_epi64_mask (__mmask8 k1, __m128i a, __m128i b)
Synopsis
__mmask8 _mm_mask_test_epi64_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vptestmq
CPUID Flags: AVX512VL + AVX512F
Description
Compute the bitwise AND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero.
Operation
FOR j := 0 to 1
i := j*64
IF k1[j]
k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:2] := 0
vptestmq
__mmask8 _mm_test_epi64_mask (__m128i a, __m128i b)
Synopsis
__mmask8 _mm_test_epi64_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vptestmq
CPUID Flags: AVX512VL + AVX512F
Description
Compute the bitwise AND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero.
Operation
FOR j := 0 to 1
i := j*64
k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0
ENDFOR
k[MAX:2] := 0
vptestmq
__mmask8 _mm256_mask_test_epi64_mask (__mmask8 k1, __m256i a, __m256i b)
Synopsis
__mmask8 _mm256_mask_test_epi64_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vptestmq
CPUID Flags: AVX512VL + AVX512F
Description
Compute the bitwise AND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero.
Operation
FOR j := 0 to 3
i := j*64
IF k1[j]
k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:4] := 0
vptestmq
__mmask8 _mm256_test_epi64_mask (__m256i a, __m256i b)
Synopsis
__mmask8 _mm256_test_epi64_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vptestmq
CPUID Flags: AVX512VL + AVX512F
Description
Compute the bitwise AND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero.
Operation
FOR j := 0 to 3
i := j*64
k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0
ENDFOR
k[MAX:4] := 0
vptestmq
__mmask8 _mm512_mask_test_epi64_mask (__mmask8 k1, __m512i a, __m512i b)
Synopsis
__mmask8 _mm512_mask_test_epi64_mask (__mmask8 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vptestmq k {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Compute the bitwise AND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero.
Operation
FOR j := 0 to 7
i := j*64
IF k1[j]
k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vptestmq
__mmask8 _mm512_test_epi64_mask (__m512i a, __m512i b)
Synopsis
__mmask8 _mm512_test_epi64_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vptestmq k {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Compute the bitwise AND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero.
Operation
FOR j := 0 to 7
i := j*64
k[j] := ((a[i+63:i] AND b[i+63:i]) != 0) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vptestmb
__mmask16 _mm_mask_test_epi8_mask (__mmask16 k1, __m128i a, __m128i b)
Synopsis
__mmask16 _mm_mask_test_epi8_mask (__mmask16 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vptestmb
CPUID Flags: AVX512VL + AVX512BW
Description
Compute the bitwise AND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero.
Operation
FOR j := 0 to 15
i := j*8
IF k1[j]
k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vptestmb
__mmask16 _mm_test_epi8_mask (__m128i a, __m128i b)
Synopsis
__mmask16 _mm_test_epi8_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vptestmb
CPUID Flags: AVX512VL + AVX512BW
Description
Compute the bitwise AND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero.
Operation
FOR j := 0 to 15
i := j*8
k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vptestmb
__mmask32 _mm256_mask_test_epi8_mask (__mmask32 k1, __m256i a, __m256i b)
Synopsis
__mmask32 _mm256_mask_test_epi8_mask (__mmask32 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vptestmb
CPUID Flags: AVX512VL + AVX512BW
Description
Compute the bitwise AND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero.
Operation
FOR j := 0 to 31
i := j*8
IF k1[j]
k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:32] := 0
vptestmb
__mmask32 _mm256_test_epi8_mask (__m256i a, __m256i b)
Synopsis
__mmask32 _mm256_test_epi8_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vptestmb
CPUID Flags: AVX512VL + AVX512BW
Description
Compute the bitwise AND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero.
Operation
FOR j := 0 to 31
i := j*8
k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0
ENDFOR
k[MAX:32] := 0
vptestmb
__mmask64 _mm512_mask_test_epi8_mask (__mmask64 k1, __m512i a, __m512i b)
Synopsis
__mmask64 _mm512_mask_test_epi8_mask (__mmask64 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vptestmb
CPUID Flags: AVX512BW
Description
Compute the bitwise AND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is non-zero.
Operation
FOR j := 0 to 63
i := j*8
IF k1[j]
k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:64] := 0
vptestmb
__mmask64 _mm512_test_epi8_mask (__m512i a, __m512i b)
Synopsis
__mmask64 _mm512_test_epi8_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vptestmb
CPUID Flags: AVX512BW
Description
Compute the bitwise AND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k if the intermediate value is non-zero.
Operation
FOR j := 0 to 63
i := j*8
k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0
ENDFOR
k[MAX:64] := 0
ptest
int _mm_test_mix_ones_zeros (__m128i a, __m128i mask)
Synopsis
int _mm_test_mix_ones_zeros (__m128i a, __m128i mask)
#include "smmintrin.h"
Instruction: ptest xmm, xmm
CPUID Flags: SSE4.1
Description
Compute the bitwise AND of 128 bits (representing integer data) in a and mask, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the bitwise AND NOT of a and mask, and set CF to 1 if the result is zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero, otherwise return 0.
Operation
IF (a[127:0] AND mask[127:0] == 0)
ZF := 1
ELSE
ZF := 0
FI
IF (a[127:0] AND NOT mask[127:0] == 0)
CF := 1
ELSE
CF := 0
FI
IF (ZF == 0 && CF == 0)
RETURN 1
ELSE
RETURN 0
FI
Performance
vtestpd
int _mm_testc_pd (__m128d a, __m128d b)
Synopsis
int _mm_testc_pd (__m128d a, __m128d b)
#include "immintrin.h"
Instruction: vtestpd xmm, xmm
CPUID Flags: AVX
Description
Compute the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point elements) in a and b, producing an intermediate 128-bit value, and set ZF to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set ZF to 0. Compute the bitwise AND NOT of a and b, producing an intermediate value, and set CF to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set CF to 0. Return the CF value.
Operation
tmp[127:0] := a[127:0] AND b[127:0]
IF (tmp[63] == tmp[127] == 0)
ZF := 1
ELSE
ZF := 0
FI
tmp[127:0] := a[127:0] AND NOT b[127:0]
IF (tmp[63] == tmp[127] == 0)
CF := 1
ELSE
CF := 0
FI
RETURN CF
Performance
vtestpd
int _mm256_testc_pd (__m256d a, __m256d b)
Synopsis
int _mm256_testc_pd (__m256d a, __m256d b)
#include "immintrin.h"
Instruction: vtestpd ymm, ymm
CPUID Flags: AVX
Description
Compute the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point elements) in a and b, producing an intermediate 256-bit value, and set ZF to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set ZF to 0. Compute the bitwise AND NOT of a and b, producing an intermediate value, and set CF to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set CF to 0. Return the CF value.
Operation
tmp[255:0] := a[255:0] AND b[255:0]
IF (tmp[63] == tmp[127] == tmp[191] == tmp[255] == 0)
ZF := 1
ELSE
ZF := 0
FI
tmp[255:0] := a[255:0] AND NOT b[255:0]
IF (tmp[63] == tmp[127] == tmp[191] == tmp[255] == 0)
CF := 1
ELSE
CF := 0
FI
RETURN CF
Performance
vtestps
int _mm_testc_ps (__m128 a, __m128 b)
Synopsis
int _mm_testc_ps (__m128 a, __m128 b)
#include "immintrin.h"
Instruction: vtestps xmm, xmm
CPUID Flags: AVX
Description
Compute the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point elements) in a and b, producing an intermediate 128-bit value, and set ZF to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set ZF to 0. Compute the bitwise AND NOT of a and b, producing an intermediate value, and set CF to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set CF to 0. Return the CF value.
Operation
tmp[127:0] := a[127:0] AND b[127:0]
IF (tmp[31] == tmp[63] == tmp[95] == tmp[127] == 0)
ZF := 1
ELSE
ZF := 0
FI
tmp[127:0] := a[127:0] AND NOT b[127:0]
IF (tmp[31] == tmp[63] == tmp[95] == tmp[127] == 0)
CF := 1
ELSE
CF := 0
FI
RETURN CF
Performance
vtestps
int _mm256_testc_ps (__m256 a, __m256 b)
Synopsis
int _mm256_testc_ps (__m256 a, __m256 b)
#include "immintrin.h"
Instruction: vtestps ymm, ymm
CPUID Flags: AVX
Description
Compute the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point elements) in a and b, producing an intermediate 256-bit value, and set ZF to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set ZF to 0. Compute the bitwise AND NOT of a and b, producing an intermediate value, and set CF to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set CF to 0. Return the CF value.
Operation
tmp[255:0] := a[255:0] AND b[255:0]
IF (tmp[31] == tmp[63] == tmp[95] == tmp[127] == tmp[159] == tmp[191] == tmp[223] == tmp[255] == 0)
ZF := 1
ELSE
ZF := 0
FI
tmp[255:0] := a[255:0] AND NOT b[255:0]
IF (tmp[31] == tmp[63] == tmp[95] == tmp[127] == tmp[159] == tmp[191] == tmp[223] == tmp[255] == 0)
CF := 1
ELSE
CF := 0
FI
RETURN CF
Performance
ptest
int _mm_testc_si128 (__m128i a, __m128i b)
Synopsis
int _mm_testc_si128 (__m128i a, __m128i b)
#include "smmintrin.h"
Instruction: ptest xmm, xmm
CPUID Flags: SSE4.1
Description
Compute the bitwise AND of 128 bits (representing integer data) in a and b, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the bitwise AND NOT of a and b, and set CF to 1 if the result is zero, otherwise set CF to 0. Return the CF value.
Operation
IF (a[127:0] AND b[127:0] == 0)
ZF := 1
ELSE
ZF := 0
FI
IF (a[127:0] AND NOT b[127:0] == 0)
CF := 1
ELSE
CF := 0
FI
RETURN CF
Performance
vptest
int _mm256_testc_si256 (__m256i a, __m256i b)
Synopsis
int _mm256_testc_si256 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vptest ymm, ymm
CPUID Flags: AVX
Description
Compute the bitwise AND of 256 bits (representing integer data) in a and b, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the bitwise AND NOT of a and b, and set CF to 1 if the result is zero, otherwise set CF to 0. Return the CF value.
Operation
IF (a[255:0] AND b[255:0] == 0)
ZF := 1
ELSE
ZF := 0
FI
IF (a[255:0] AND NOT b[255:0] == 0)
CF := 1
ELSE
CF := 0
FI
RETURN CF
Performance
vptestnmw
__mmask8 _mm_mask_testn_epi16_mask (__mmask8 k1, __m128i a, __m128i b)
Synopsis
__mmask8 _mm_mask_testn_epi16_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vptestnmw
CPUID Flags: AVX512VL + AVX512BW
Description
Compute the bitwise NAND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero.
Operation
FOR j := 0 to 7
i := j*16
IF k1[j]
k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vptestnmw
__mmask8 _mm_testn_epi16_mask (__m128i a, __m128i b)
Synopsis
__mmask8 _mm_testn_epi16_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vptestnmw
CPUID Flags: AVX512VL + AVX512BW
Description
Compute the bitwise NAND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k if the intermediate value is zero.
Operation
FOR j := 0 to 7
i := j*16
k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vptestnmw
__mmask16 _mm256_mask_testn_epi16_mask (__mmask16 k1, __m256i a, __m256i b)
Synopsis
__mmask16 _mm256_mask_testn_epi16_mask (__mmask16 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vptestnmw
CPUID Flags: AVX512VL + AVX512BW
Description
Compute the bitwise NAND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero.
Operation
FOR j := 0 to 15
i := j*16
IF k1[j]
k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vptestnmw
__mmask16 _mm256_testn_epi16_mask (__m256i a, __m256i b)
Synopsis
__mmask16 _mm256_testn_epi16_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vptestnmw
CPUID Flags: AVX512VL + AVX512BW
Description
Compute the bitwise NAND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k if the intermediate value is zero.
Operation
FOR j := 0 to 15
i := j*16
k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vptestnmw
__mmask32 _mm512_mask_testn_epi16_mask (__mmask32 k1, __m512i a, __m512i b)
Synopsis
__mmask32 _mm512_mask_testn_epi16_mask (__mmask32 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vptestnmw
CPUID Flags: AVX512BW
Description
Compute the bitwise NAND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero.
Operation
FOR j := 0 to 31
i := j*16
IF k1[j]
k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:32] := 0
vptestnmw
__mmask32 _mm512_testn_epi16_mask (__m512i a, __m512i b)
Synopsis
__mmask32 _mm512_testn_epi16_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vptestnmw
CPUID Flags: AVX512BW
Description
Compute the bitwise NAND of packed 16-bit integers in a and b, producing intermediate 16-bit values, and set the corresponding bit in result mask k if the intermediate value is zero.
Operation
FOR j := 0 to 31
i := j*16
k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0
ENDFOR
k[MAX:32] := 0
vptestnmd
__mmask8 _mm_mask_testn_epi32_mask (__mmask8 k1, __m128i a, __m128i b)
Synopsis
__mmask8 _mm_mask_testn_epi32_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vptestnmd
CPUID Flags: AVX512VL + AVX512F
Description
Compute the bitwise NAND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero.
Operation
FOR j := 0 to 3
i := j*32
IF k1[j]
k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:4] := 0
vptestnmd
__mmask8 _mm_testn_epi32_mask (__m128i a, __m128i b)
Synopsis
__mmask8 _mm_testn_epi32_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vptestnmd
CPUID Flags: AVX512VL + AVX512F
Description
Compute the bitwise NAND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k if the intermediate value is zero.
Operation
FOR j := 0 to 3
i := j*32
k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0
ENDFOR
k[MAX:4] := 0
vptestnmd
__mmask8 _mm256_mask_testn_epi32_mask (__mmask8 k1, __m256i a, __m256i b)
Synopsis
__mmask8 _mm256_mask_testn_epi32_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vptestnmd
CPUID Flags: AVX512VL + AVX512F
Description
Compute the bitwise NAND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero.
Operation
FOR j := 0 to 7
i := j*32
IF k1[j]
k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vptestnmd
__mmask8 _mm256_testn_epi32_mask (__m256i a, __m256i b)
Synopsis
__mmask8 _mm256_testn_epi32_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vptestnmd
CPUID Flags: AVX512VL + AVX512F
Description
Compute the bitwise NAND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k if the intermediate value is zero.
Operation
FOR j := 0 to 7
i := j*32
k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vptestnmd
__mmask16 _mm512_mask_testn_epi32_mask (__mmask16 k1, __m512i a, __m512i b)
Synopsis
__mmask16 _mm512_mask_testn_epi32_mask (__mmask16 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vptestnmd k {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Compute the bitwise NAND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero.
Operation
FOR j := 0 to 15
i := j*32
IF k1[j]
k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vptestnmd
__mmask16 _mm512_testn_epi32_mask (__m512i a, __m512i b)
Synopsis
__mmask16 _mm512_testn_epi32_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vptestnmd k {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Compute the bitwise NAND of packed 32-bit integers in a and b, producing intermediate 32-bit values, and set the corresponding bit in result mask k if the intermediate value is zero.
Operation
FOR j := 0 to 15
i := j*32
k[j] := ((a[i+31:i] AND b[i+31:i]) == 0) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vptestnmq
__mmask8 _mm_mask_testn_epi64_mask (__mmask8 k1, __m128i a, __m128i b)
Synopsis
__mmask8 _mm_mask_testn_epi64_mask (__mmask8 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vptestnmq
CPUID Flags: AVX512VL + AVX512F
Description
Compute the bitwise NAND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero.
Operation
FOR j := 0 to 1
i := j*64
IF k1[j]
k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:2] := 0
vptestnmq
__mmask8 _mm_testn_epi64_mask (__m128i a, __m128i b)
Synopsis
__mmask8 _mm_testn_epi64_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vptestnmq
CPUID Flags: AVX512VL + AVX512F
Description
Compute the bitwise NAND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k if the intermediate value is zero.
Operation
FOR j := 0 to 1
i := j*64
k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0
ENDFOR
k[MAX:2] := 0
vptestnmq
__mmask8 _mm256_mask_testn_epi64_mask (__mmask8 k1, __m256i a, __m256i b)
Synopsis
__mmask8 _mm256_mask_testn_epi64_mask (__mmask8 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vptestnmq
CPUID Flags: AVX512VL + AVX512F
Description
Compute the bitwise NAND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero.
Operation
FOR j := 0 to 3
i := j*64
IF k1[j]
k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:4] := 0
vptestnmq
__mmask8 _mm256_testn_epi64_mask (__m256i a, __m256i b)
Synopsis
__mmask8 _mm256_testn_epi64_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vptestnmq
CPUID Flags: AVX512VL + AVX512F
Description
Compute the bitwise NAND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k if the intermediate value is zero.
Operation
FOR j := 0 to 3
i := j*64
k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0
ENDFOR
k[MAX:4] := 0
vptestnmq
__mmask8 _mm512_mask_testn_epi64_mask (__mmask8 k1, __m512i a, __m512i b)
Synopsis
__mmask8 _mm512_mask_testn_epi64_mask (__mmask8 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vptestnmq k {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Compute the bitwise NAND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero.
Operation
FOR j := 0 to 7
i := j*64
IF k1[j]
k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:8] := 0
vptestnmq
__mmask8 _mm512_testn_epi64_mask (__m512i a, __m512i b)
Synopsis
__mmask8 _mm512_testn_epi64_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vptestnmq k {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Compute the bitwise NAND of packed 64-bit integers in a and b, producing intermediate 64-bit values, and set the corresponding bit in result mask k if the intermediate value is zero.
Operation
FOR j := 0 to 7
i := j*64
k[j] := ((a[i+63:i] AND b[i+63:i]) == 0) ? 1 : 0
ENDFOR
k[MAX:8] := 0
vptestnmb
__mmask16 _mm_mask_testn_epi8_mask (__mmask16 k1, __m128i a, __m128i b)
Synopsis
__mmask16 _mm_mask_testn_epi8_mask (__mmask16 k1, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vptestnmb
CPUID Flags: AVX512VL + AVX512BW
Description
Compute the bitwise NAND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero.
Operation
FOR j := 0 to 15
i := j*8
IF k1[j]
k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:16] := 0
vptestnmb
__mmask16 _mm_testn_epi8_mask (__m128i a, __m128i b)
Synopsis
__mmask16 _mm_testn_epi8_mask (__m128i a, __m128i b)
#include "immintrin.h"
Instruction: vptestnmb
CPUID Flags: AVX512VL + AVX512BW
Description
Compute the bitwise NAND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k if the intermediate value is zero.
Operation
FOR j := 0 to 15
i := j*8
k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0
ENDFOR
k[MAX:16] := 0
vptestnmb
__mmask32 _mm256_mask_testn_epi8_mask (__mmask32 k1, __m256i a, __m256i b)
Synopsis
__mmask32 _mm256_mask_testn_epi8_mask (__mmask32 k1, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vptestnmb
CPUID Flags: AVX512VL + AVX512BW
Description
Compute the bitwise NAND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero.
Operation
FOR j := 0 to 31
i := j*8
IF k1[j]
k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:32] := 0
vptestnmb
__mmask32 _mm256_testn_epi8_mask (__m256i a, __m256i b)
Synopsis
__mmask32 _mm256_testn_epi8_mask (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vptestnmb
CPUID Flags: AVX512VL + AVX512BW
Description
Compute the bitwise NAND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k if the intermediate value is zero.
Operation
FOR j := 0 to 31
i := j*8
k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0
ENDFOR
k[MAX:32] := 0
vptestnmb
__mmask64 _mm512_mask_testn_epi8_mask (__mmask64 k1, __m512i a, __m512i b)
Synopsis
__mmask64 _mm512_mask_testn_epi8_mask (__mmask64 k1, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vptestnmb
CPUID Flags: AVX512BW
Description
Compute the bitwise NAND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k (subject to writemask k) if the intermediate value is zero.
Operation
FOR j := 0 to 63
i := j*8
IF k1[j]
k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0
ELSE
k[j] := 0
FI
ENDFOR
k[MAX:64] := 0
vptestnmb
__mmask64 _mm512_testn_epi8_mask (__m512i a, __m512i b)
Synopsis
__mmask64 _mm512_testn_epi8_mask (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vptestnmb
CPUID Flags: AVX512BW
Description
Compute the bitwise NAND of packed 8-bit integers in a and b, producing intermediate 8-bit values, and set the corresponding bit in result mask k if the intermediate value is zero.
Operation
FOR j := 0 to 63
i := j*8
k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0
ENDFOR
k[MAX:64] := 0
vtestpd
int _mm_testnzc_pd (__m128d a, __m128d b)
Synopsis
int _mm_testnzc_pd (__m128d a, __m128d b)
#include "immintrin.h"
Instruction: vtestpd xmm, xmm
CPUID Flags: AVX
Description
Compute the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point elements) in a and b, producing an intermediate 128-bit value, and set ZF to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set ZF to 0. Compute the bitwise AND NOT of a and b, producing an intermediate value, and set CF to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero, otherwise return 0.
Operation
tmp[127:0] := a[127:0] AND b[127:0]
IF (tmp[63] == tmp[127] == 0)
ZF := 1
ELSE
ZF := 0
FI
tmp[127:0] := a[127:0] AND NOT b[127:0]
IF (tmp[63] == tmp[127] == 0)
CF := 1
ELSE
CF := 0
FI
IF (ZF == 0 && CF == 0)
RETURN 1
ELSE
RETURN 0
FI
Performance
vtestpd
int _mm256_testnzc_pd (__m256d a, __m256d b)
Synopsis
int _mm256_testnzc_pd (__m256d a, __m256d b)
#include "immintrin.h"
Instruction: vtestpd ymm, ymm
CPUID Flags: AVX
Description
Compute the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point elements) in a and b, producing an intermediate 256-bit value, and set ZF to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set ZF to 0. Compute the bitwise AND NOT of a and b, producing an intermediate value, and set CF to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero, otherwise return 0.
Operation
tmp[255:0] := a[255:0] AND b[255:0]
IF (tmp[63] == tmp[127] == tmp[191] == tmp[255] == 0)
ZF := 1
ELSE
ZF := 0
FI
tmp[255:0] := a[255:0] AND NOT b[255:0]
IF (tmp[63] == tmp[127] == tmp[191] == tmp[255] == 0)
CF := 1
ELSE
CF := 0
FI
IF (ZF == 0 && CF == 0)
RETURN 1
ELSE
RETURN 0
FI
Performance
vtestps
int _mm_testnzc_ps (__m128 a, __m128 b)
Synopsis
int _mm_testnzc_ps (__m128 a, __m128 b)
#include "immintrin.h"
Instruction: vtestps xmm, xmm
CPUID Flags: AVX
Description
Compute the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point elements) in a and b, producing an intermediate 128-bit value, and set ZF to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set ZF to 0. Compute the bitwise AND NOT of a and b, producing an intermediate value, and set CF to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero, otherwise return 0.
Operation
tmp[127:0] := a[127:0] AND b[127:0]
IF (tmp[31] == tmp[63] == tmp[95] == tmp[127] == 0)
ZF := 1
ELSE
ZF := 0
FI
tmp[127:0] := a[127:0] AND NOT b[127:0]
IF (tmp[31] == tmp[63] == tmp[95] == tmp[127] == 0)
CF := 1
ELSE
CF := 0
FI
IF (ZF == 0 && CF == 0)
RETURN 1
ELSE
RETURN 0
FI
Performance
vtestps
int _mm256_testnzc_ps (__m256 a, __m256 b)
Synopsis
int _mm256_testnzc_ps (__m256 a, __m256 b)
#include "immintrin.h"
Instruction: vtestps ymm, ymm
CPUID Flags: AVX
Description
Compute the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point elements) in a and b, producing an intermediate 256-bit value, and set ZF to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set ZF to 0. Compute the bitwise AND NOT of a and b, producing an intermediate value, and set CF to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero, otherwise return 0.
Operation
tmp[255:0] := a[255:0] AND b[255:0]
IF (tmp[31] == tmp[63] == tmp[95] == tmp[127] == tmp[159] == tmp[191] == tmp[223] == tmp[255] == 0)
ZF := 1
ELSE
ZF := 0
FI
tmp[255:0] := a[255:0] AND NOT b[255:0]
IF (tmp[31] == tmp[63] == tmp[95] == tmp[127] == tmp[159] == tmp[191] == tmp[223] == tmp[255] == 0)
CF := 1
ELSE
CF := 0
FI
IF (ZF == 0 && CF == 0)
RETURN 1
ELSE
RETURN 0
FI
Performance
ptest
int _mm_testnzc_si128 (__m128i a, __m128i b)
Synopsis
int _mm_testnzc_si128 (__m128i a, __m128i b)
#include "smmintrin.h"
Instruction: ptest xmm, xmm
CPUID Flags: SSE4.1
Description
Compute the bitwise AND of 128 bits (representing integer data) in a and b, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the bitwise AND NOT of a and b, and set CF to 1 if the result is zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero, otherwise return 0.
Operation
IF (a[127:0] AND b[127:0] == 0)
ZF := 1
ELSE
ZF := 0
FI
IF (a[127:0] AND NOT b[127:0] == 0)
CF := 1
ELSE
CF := 0
FI
IF (ZF == 0 && CF == 0)
RETURN 1
ELSE
RETURN 0
FI
Performance
vptest
int _mm256_testnzc_si256 (__m256i a, __m256i b)
Synopsis
int _mm256_testnzc_si256 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vptest ymm, ymm
CPUID Flags: AVX
Description
Compute the bitwise AND of 256 bits (representing integer data) in a and b, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the bitwise AND NOT of a and b, and set CF to 1 if the result is zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero, otherwise return 0.
Operation
IF (a[255:0] AND b[255:0] == 0)
ZF := 1
ELSE
ZF := 0
FI
IF (a[255:0] AND NOT b[255:0] == 0)
CF := 1
ELSE
CF := 0
FI
IF (ZF == 0 && CF == 0)
RETURN 1
ELSE
RETURN 0
FI
Performance
vtestpd
int _mm_testz_pd (__m128d a, __m128d b)
Synopsis
int _mm_testz_pd (__m128d a, __m128d b)
#include "immintrin.h"
Instruction: vtestpd xmm, xmm
CPUID Flags: AVX
Description
Compute the bitwise AND of 128 bits (representing double-precision (64-bit) floating-point elements) in a and b, producing an intermediate 128-bit value, and set ZF to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set ZF to 0. Compute the bitwise AND NOT of a and b, producing an intermediate value, and set CF to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set CF to 0. Return the ZF value.
Operation
tmp[127:0] := a[127:0] AND b[127:0]
IF (tmp[63] == tmp[127] == 0)
ZF := 1
ELSE
ZF := 0
FI
tmp[127:0] := a[127:0] AND NOT b[127:0]
IF (tmp[63] == tmp[127] == 0)
CF := 1
ELSE
CF := 0
FI
RETURN ZF
Performance
vtestpd
int _mm256_testz_pd (__m256d a, __m256d b)
Synopsis
int _mm256_testz_pd (__m256d a, __m256d b)
#include "immintrin.h"
Instruction: vtestpd ymm, ymm
CPUID Flags: AVX
Description
Compute the bitwise AND of 256 bits (representing double-precision (64-bit) floating-point elements) in a and b, producing an intermediate 256-bit value, and set ZF to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set ZF to 0. Compute the bitwise AND NOT of a and b, producing an intermediate value, and set CF to 1 if the sign bit of each 64-bit element in the intermediate value is zero, otherwise set CF to 0. Return the ZF value.
Operation
tmp[255:0] := a[255:0] AND b[255:0]
IF (tmp[63] == tmp[127] == tmp[191] == tmp[255] == 0)
ZF := 1
ELSE
ZF := 0
FI
tmp[255:0] := a[255:0] AND NOT b[255:0]
IF (tmp[63] == tmp[127] == tmp[191] == tmp[255] == 0)
CF := 1
ELSE
CF := 0
FI
RETURN ZF
Performance
vtestps
int _mm_testz_ps (__m128 a, __m128 b)
Synopsis
int _mm_testz_ps (__m128 a, __m128 b)
#include "immintrin.h"
Instruction: vtestps xmm, xmm
CPUID Flags: AVX
Description
Compute the bitwise AND of 128 bits (representing single-precision (32-bit) floating-point elements) in a and b, producing an intermediate 128-bit value, and set ZF to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set ZF to 0. Compute the bitwise AND NOT of a and b, producing an intermediate value, and set CF to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set CF to 0. Return the ZF value.
Operation
tmp[127:0] := a[127:0] AND b[127:0]
IF (tmp[31] == tmp[63] == tmp[95] == tmp[127] == 0)
ZF := 1
ELSE
ZF := 0
FI
tmp[127:0] := a[127:0] AND NOT b[127:0]
IF (tmp[31] == tmp[63] == tmp[95] == tmp[127] == 0)
CF := 1
ELSE
CF := 0
FI
RETURN ZF
Performance
vtestps
int _mm256_testz_ps (__m256 a, __m256 b)
Synopsis
int _mm256_testz_ps (__m256 a, __m256 b)
#include "immintrin.h"
Instruction: vtestps ymm, ymm
CPUID Flags: AVX
Description
Compute the bitwise AND of 256 bits (representing single-precision (32-bit) floating-point elements) in a and b, producing an intermediate 256-bit value, and set ZF to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set ZF to 0. Compute the bitwise AND NOT of a and b, producing an intermediate value, and set CF to 1 if the sign bit of each 32-bit element in the intermediate value is zero, otherwise set CF to 0. Return the ZF value.
Operation
tmp[255:0] := a[255:0] AND b[255:0]
IF (tmp[31] == tmp[63] == tmp[95] == tmp[127] == tmp[159] == tmp[191] == tmp[223] == tmp[255] == 0)
ZF := 1
ELSE
ZF := 0
FI
tmp[255:0] := a[255:0] AND NOT b[255:0]
IF (tmp[31] == tmp[63] == tmp[95] == tmp[127] == tmp[159] == tmp[191] == tmp[223] == tmp[255] == 0)
CF := 1
ELSE
CF := 0
FI
RETURN ZF
Performance
ptest
int _mm_testz_si128 (__m128i a, __m128i b)
Synopsis
int _mm_testz_si128 (__m128i a, __m128i b)
#include "smmintrin.h"
Instruction: ptest xmm, xmm
CPUID Flags: SSE4.1
Description
Compute the bitwise AND of 128 bits (representing integer data) in a and b, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the bitwise AND NOT of a and b, and set CF to 1 if the result is zero, otherwise set CF to 0. Return the ZF value.
Operation
IF (a[127:0] AND b[127:0] == 0)
ZF := 1
ELSE
ZF := 0
FI
IF (a[127:0] AND NOT b[127:0] == 0)
CF := 1
ELSE
CF := 0
FI
RETURN ZF
Performance
vptest
int _mm256_testz_si256 (__m256i a, __m256i b)
Synopsis
int _mm256_testz_si256 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vptest ymm, ymm
CPUID Flags: AVX
Description
Compute the bitwise AND of 256 bits (representing integer data) in a and b, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the bitwise AND NOT of a and b, and set CF to 1 if the result is zero, otherwise set CF to 0. Return the ZF value.
Operation
IF (a[255:0] AND b[255:0] == 0)
ZF := 1
ELSE
ZF := 0
FI
IF (a[255:0] AND NOT b[255:0] == 0)
CF := 1
ELSE
CF := 0
FI
RETURN ZF
Performance
...
_MM_TRANSPOSE4_PS (__m128 row0, __m128 row1, __m128 row2, __m128 row3)
Synopsis
_MM_TRANSPOSE4_PS (__m128 row0, __m128 row1, __m128 row2, __m128 row3)
#include "xmmintrin.h"
CPUID Flags: SSE
Description
Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision (32-bit) floating-point elements in row0, row1, row2, and row3, and store the transposed matrix in these vectors (row0 now contains column 0, etc.).
Operation
__m128 tmp3, tmp2, tmp1, tmp0;
tmp0 = _mm_unpacklo_ps(row0, row1);
tmp2 = _mm_unpacklo_ps(row2, row3);
tmp1 = _mm_unpackhi_ps(row0, row1);
tmp3 = _mm_unpackhi_ps(row2, row3);
row0 = _mm_movelh_ps(tmp0, tmp2);
row1 = _mm_movehl_ps(tmp2, tmp0);
row2 = _mm_movelh_ps(tmp1, tmp3);
row3 = _mm_movehl_ps(tmp3, tmp1);
...
__m128d _mm_trunc_pd (__m128d a)
Synopsis
__m128d _mm_trunc_pd (__m128d a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Truncate the packed double-precision (64-bit) floating-point elements in a, and store the results as packed double-precision floating-point elements in dst. This intrinsic may generate the roundpd/vroundpd instruction.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := TRUNCATE(a[i+63:i])
ENDFOR
dst[MAX:128] := 0
...
__m256d _mm256_trunc_pd (__m256d a)
Synopsis
__m256d _mm256_trunc_pd (__m256d a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Truncate the packed double-precision (64-bit) floating-point elements in a, and store the results as packed double-precision floating-point elements in dst. This intrinsic may generate the roundpd/vroundpd instruction.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := TRUNCATE(a[i+63:i])
ENDFOR
dst[MAX:256] := 0
...
__m512d _mm512_mask_trunc_pd (__m512d src, __mmask8 k, __m512d a)
Synopsis
__m512d _mm512_mask_trunc_pd (__m512d src, __mmask8 k, __m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Truncate the packed double-precision (64-bit) floating-point elements in a, and store the results as packed double-precision floating-point elements in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := TRUNCATE(a[i+63:i])
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m512d _mm512_trunc_pd (__m512d a)
Synopsis
__m512d _mm512_trunc_pd (__m512d a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Truncate the packed double-precision (64-bit) floating-point elements in a, and store the results as packed double-precision floating-point elements in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := TRUNCATE(a[i+63:i])
ENDFOR
dst[MAX:512] := 0
...
__m128 _mm_trunc_ps (__m128 a)
Synopsis
__m128 _mm_trunc_ps (__m128 a)
#include "immintrin.h"
CPUID Flags: SSE
Description
Truncate the packed single-precision (32-bit) floating-point elements in a, and store the results as packed single-precision floating-point elements in dst. This intrinsic may generate the roundps/vroundps instruction.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := TRUNCATE(a[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256 _mm256_trunc_ps (__m256 a)
Synopsis
__m256 _mm256_trunc_ps (__m256 a)
#include "immintrin.h"
CPUID Flags: AVX
Description
Truncate the packed single-precision (32-bit) floating-point elements in a, and store the results as packed single-precision floating-point elements in dst. This intrinsic may generate the roundps/vroundps instruction.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := TRUNCATE(a[i+31:i])
ENDFOR
dst[MAX:256] := 0
...
__m512 _mm512_mask_trunc_ps (__m512 src, __mmask16 k, __m512 a)
Synopsis
__m512 _mm512_mask_trunc_ps (__m512 src, __mmask16 k, __m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Truncate the packed single-precision (32-bit) floating-point elements in a, and store the results as packed single-precision floating-point elements in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := TRUNCATE(a[i+31:i])
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
...
__m512 _mm512_trunc_ps (__m512 a)
Synopsis
__m512 _mm512_trunc_ps (__m512 a)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Truncate the packed single-precision (32-bit) floating-point elements in a, and store the results as packed single-precision floating-point elements in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := TRUNCATE(a[i+31:i])
ENDFOR
dst[MAX:512] := 0
tzcnt
int _mm_tzcnt_32 (unsigned int a)
Synopsis
int _mm_tzcnt_32 (unsigned int a)
#include "immintrin.h"
Instruction: tzcnt r32, r32
CPUID Flags: BMI1
Description
Count the number of trailing zero bits in unsigned 32-bit integer a, and return that count in dst.
Operation
tmp := 0
dst := 0
DO WHILE ((tmp < 32) AND a[tmp] == 0)
tmp := tmp + 1
dst := dst + 1
OD
tzcnt
__int64 _mm_tzcnt_64 (unsigned __int64 a)
Synopsis
__int64 _mm_tzcnt_64 (unsigned __int64 a)
#include "immintrin.h"
Instruction: tzcnt r64, r64
CPUID Flags: BMI1
Description
Count the number of trailing zero bits in unsigned 64-bit integer a, and return that count in dst.
Operation
tmp := 0
dst := 0
DO WHILE ((tmp < 64) AND a[tmp] == 0)
tmp := tmp + 1
dst := dst + 1
OD
tzcnt
unsigned int _tzcnt_u32 (unsigned int a)
Synopsis
unsigned int _tzcnt_u32 (unsigned int a)
#include "immintrin.h"
Instruction: tzcnt r32, r32
CPUID Flags: BMI1
Description
Count the number of trailing zero bits in unsigned 32-bit integer a, and return that count in dst.
Operation
tmp := 0
dst := 0
DO WHILE ((tmp < 32) AND a[tmp] == 0)
tmp := tmp + 1
dst := dst + 1
OD
Performance
tzcnt
unsigned __int64 _tzcnt_u64 (unsigned __int64 a)
Synopsis
unsigned __int64 _tzcnt_u64 (unsigned __int64 a)
#include "immintrin.h"
Instruction: tzcnt r64, r64
CPUID Flags: BMI1
Description
Count the number of trailing zero bits in unsigned 64-bit integer a, and return that count in dst.
Operation
tmp := 0
dst := 0
DO WHILE ((tmp < 64) AND a[tmp] == 0)
tmp := tmp + 1
dst := dst + 1
OD
Performance
tzcnti
int _mm_tzcnti_32 (int a, unsigned int x)
Synopsis
int _mm_tzcnti_32 (int a, unsigned int x)
#include "immintrin.h"
Instruction: tzcnti r32, r32
CPUID Flags: KNCNI
Description
Counts the number of trailing zero bits in unsigned 32-bit integer x starting at bit a, storing the result in dst.
Operation
count := 0
FOR j := a to 31
IF NOT(x[j] == 1)
count := count + 1
FI
ENDFOR
dst := count
tzcnti
__int64 _mm_tzcnti_64 (__int64 a, unsigned __int64 x)
Synopsis
__int64 _mm_tzcnti_64 (__int64 a, unsigned __int64 x)
#include "immintrin.h"
Instruction: tzcnti r64, r64
CPUID Flags: KNCNI
Description
Counts the number of trailing zero bits in unsigned 64-bit integer x starting at bit a, storing the result in dst.
Operation
count := 0
FOR j := a to 63
IF NOT(x[j] == 1)
count := count + 1
FI
ENDFOR
dst := count
ucomisd
int _mm_ucomieq_sd (__m128d a, __m128d b)
Synopsis
int _mm_ucomieq_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: ucomisd xmm, xmm
CPUID Flags: SSE2
Description
Compare the lower double-precision (64-bit) floating-point element in a and b for equality, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
Operation
RETURN ( a[63:0] == b[63:0] ) ? 1 : 0
Performance
ucomiss
int _mm_ucomieq_ss (__m128 a, __m128 b)
Synopsis
int _mm_ucomieq_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: ucomiss xmm, xmm
CPUID Flags: SSE
Description
Compare the lower single-precision (32-bit) floating-point element in a and b for equality, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
Operation
RETURN ( a[31:0] == b[31:0] ) ? 1 : 0
Performance
ucomisd
int _mm_ucomige_sd (__m128d a, __m128d b)
Synopsis
int _mm_ucomige_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: ucomisd xmm, xmm
CPUID Flags: SSE2
Description
Compare the lower double-precision (64-bit) floating-point element in a and b for greater-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
Operation
RETURN ( a[63:0] >= b[63:0] ) ? 1 : 0
Performance
ucomiss
int _mm_ucomige_ss (__m128 a, __m128 b)
Synopsis
int _mm_ucomige_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: ucomiss xmm, xmm
CPUID Flags: SSE
Description
Compare the lower single-precision (32-bit) floating-point element in a and b for greater-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
Operation
RETURN ( a[31:0] >= b[31:0] ) ? 1 : 0
Performance
ucomisd
int _mm_ucomigt_sd (__m128d a, __m128d b)
Synopsis
int _mm_ucomigt_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: ucomisd xmm, xmm
CPUID Flags: SSE2
Description
Compare the lower double-precision (64-bit) floating-point element in a and b for greater-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
Operation
RETURN ( a[63:0] > b[63:0] ) ? 1 : 0
Performance
ucomiss
int _mm_ucomigt_ss (__m128 a, __m128 b)
Synopsis
int _mm_ucomigt_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: ucomiss xmm, xmm
CPUID Flags: SSE
Description
Compare the lower single-precision (32-bit) floating-point element in a and b for greater-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
Operation
RETURN ( a[31:0] > b[31:0] ) ? 1 : 0
Performance
ucomisd
int _mm_ucomile_sd (__m128d a, __m128d b)
Synopsis
int _mm_ucomile_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: ucomisd xmm, xmm
CPUID Flags: SSE2
Description
Compare the lower double-precision (64-bit) floating-point element in a and b for less-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
Operation
RETURN ( a[63:0] <= b[63:0] ) ? 1 : 0
Performance
ucomiss
int _mm_ucomile_ss (__m128 a, __m128 b)
Synopsis
int _mm_ucomile_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: ucomiss xmm, xmm
CPUID Flags: SSE
Description
Compare the lower single-precision (32-bit) floating-point element in a and b for less-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
Operation
RETURN ( a[31:0] <= b[31:0] ) ? 1 : 0
Performance
ucomisd
int _mm_ucomilt_sd (__m128d a, __m128d b)
Synopsis
int _mm_ucomilt_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: ucomisd xmm, xmm
CPUID Flags: SSE2
Description
Compare the lower double-precision (64-bit) floating-point element in a and b for less-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
Operation
RETURN ( a[63:0] < b[63:0] ) ? 1 : 0
Performance
ucomiss
int _mm_ucomilt_ss (__m128 a, __m128 b)
Synopsis
int _mm_ucomilt_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: ucomiss xmm, xmm
CPUID Flags: SSE
Description
Compare the lower single-precision (32-bit) floating-point element in a and b for less-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
Operation
RETURN ( a[31:0] < b[31:0] ) ? 1 : 0
Performance
ucomisd
int _mm_ucomineq_sd (__m128d a, __m128d b)
Synopsis
int _mm_ucomineq_sd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: ucomisd xmm, xmm
CPUID Flags: SSE2
Description
Compare the lower double-precision (64-bit) floating-point element in a and b for not-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
Operation
RETURN ( a[63:0] != b[63:0] ) ? 1 : 0
Performance
ucomiss
int _mm_ucomineq_ss (__m128 a, __m128 b)
Synopsis
int _mm_ucomineq_ss (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: ucomiss xmm, xmm
CPUID Flags: SSE
Description
Compare the lower single-precision (32-bit) floating-point element in a and b for not-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
Operation
RETURN ( a[31:0] != b[31:0] ) ? 1 : 0
Performance
...
__m128i _mm_udiv_epi32 (__m128i a, __m128i b)
Synopsis
__m128i _mm_udiv_epi32 (__m128i a, __m128i b)
#include "immintrin.h"
CPUID Flags: SSE
Description
Divide packed unsigned 32-bit integers in a by packed elements in b, and store the truncated results in dst.
Operation
FOR j := 0 to 3
i := 32*j
dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256i _mm256_udiv_epi32 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_udiv_epi32 (__m256i a, __m256i b)
#include "immintrin.h"
CPUID Flags: AVX
Description
Divide packed unsigned 32-bit integers in a by packed elements in b, and store the truncated results in dst.
Operation
FOR j := 0 to 7
i := 32*j
dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:256] := 0
...
__m128i _mm_udivrem_epi32 (__m128i * mem_addr, __m128i a, __m128i b)
Synopsis
__m128i _mm_udivrem_epi32 (__m128i * mem_addr, __m128i a, __m128i b)
#include "immintrin.h"
CPUID Flags: SSE
Description
Divide packed unsigned 32-bit integers in a by packed elements in b, store the truncated results in dst, and store the remainders as packed unsigned 32-bit integers into memory at mem_addr.
Operation
FOR j := 0 to 3
i := 32*j
dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
MEM[mem_addr+i+31:mem_addr+i] := REMAINDER(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256i _mm256_udivrem_epi32 (__m256i * mem_addr, __m256i a, __m256i b)
Synopsis
__m256i _mm256_udivrem_epi32 (__m256i * mem_addr, __m256i a, __m256i b)
#include "immintrin.h"
CPUID Flags: AVX
Description
Divide packed unsigned 32-bit integers in a by packed elements in b, store the truncated results in dst, and store the remainders as packed unsigned 32-bit integers into memory at mem_addr.
Operation
FOR j := 0 to 7
i := 32*j
dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
MEM[mem_addr+i+31:mem_addr+i] := REMAINDER(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:256] := 0
__m512 _mm512_undefined (void)
Synopsis
__m512 _mm512_undefined (void)
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Return vector of type __m512 with undefined elements.
__m512i _mm512_undefined_epi32 ()
Synopsis
__m512i _mm512_undefined_epi32 ()
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Return vector of type __m512i with undefined elements.
__m128d _mm_undefined_pd (void)
Synopsis
__m128d _mm_undefined_pd (void)
#include "immintrin.h"
CPUID Flags: AVX
Description
Return vector of type __m128d with undefined elements.
__m256d _mm256_undefined_pd (void)
Synopsis
__m256d _mm256_undefined_pd (void)
#include "immintrin.h"
CPUID Flags: AVX
Description
Return vector of type __m256d with undefined elements.
__m512d _mm512_undefined_pd ()
Synopsis
__m512d _mm512_undefined_pd ()
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Return vector of type __m512d with undefined elements.
__m128 _mm_undefined_ps (void)
Synopsis
__m128 _mm_undefined_ps (void)
#include "immintrin.h"
CPUID Flags: AVX
Description
Return vector of type __m128 with undefined elements.
__m256 _mm256_undefined_ps (void)
Synopsis
__m256 _mm256_undefined_ps (void)
#include "immintrin.h"
CPUID Flags: AVX
Description
Return vector of type __m256 with undefined elements.
__m512 _mm512_undefined_ps ()
Synopsis
__m512 _mm512_undefined_ps ()
#include "immintrin.h"
CPUID Flags: AVX512F
Description
Return vector of type __m512 with undefined elements.
__m128i _mm_undefined_si128 (void)
Synopsis
__m128i _mm_undefined_si128 (void)
#include "immintrin.h"
CPUID Flags: AVX
Description
Return vector of type __m128i with undefined elements.
__m256i _mm256_undefined_si256 (void)
Synopsis
__m256i _mm256_undefined_si256 (void)
#include "immintrin.h"
CPUID Flags: AVX
Description
Return vector of type __m256i with undefined elements.
vpunpckhwd
__m128i _mm_mask_unpackhi_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_unpackhi_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpunpckhwd
CPUID Flags: AVX512VL + AVX512BW
Description
Unpack and interleave 16-bit integers from the high half of a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]){
dst[15:0] := src1[79:64]
dst[31:16] := src2[79:64]
dst[47:32] := src1[95:80]
dst[63:48] := src2[95:80]
dst[79:64] := src1[111:96]
dst[95:80] := src2[111:96]
dst[111:96] := src1[127:112]
dst[127:112] := src2[127:112]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0])
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := tmp_dst[i+15:i]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:128] := 0
vpunpckhwd
__m128i _mm_maskz_unpackhi_epi16 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_unpackhi_epi16 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpunpckhwd
CPUID Flags: AVX512VL + AVX512BW
Description
Unpack and interleave 16-bit integers from the high half of a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]){
dst[15:0] := src1[79:64]
dst[31:16] := src2[79:64]
dst[47:32] := src1[95:80]
dst[63:48] := src2[95:80]
dst[79:64] := src1[111:96]
dst[95:80] := src2[111:96]
dst[111:96] := src1[127:112]
dst[127:112] := src2[127:112]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0])
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := tmp_dst[i+15:i]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
punpckhwd
__m128i _mm_unpackhi_epi16 (__m128i a, __m128i b)
Synopsis
__m128i _mm_unpackhi_epi16 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: punpckhwd xmm, xmm
CPUID Flags: SSE2
Description
Unpack and interleave 16-bit integers from the high half of a and b, and store the results in dst.
Operation
INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]){
dst[15:0] := src1[79:64]
dst[31:16] := src2[79:64]
dst[47:32] := src1[95:80]
dst[63:48] := src2[95:80]
dst[79:64] := src1[111:96]
dst[95:80] := src2[111:96]
dst[111:96] := src1[127:112]
dst[127:112] := src2[127:112]
RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0])
Performance
vpunpckhwd
__m256i _mm256_mask_unpackhi_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_unpackhi_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpunpckhwd
CPUID Flags: AVX512VL + AVX512BW
Description
Unpack and interleave 16-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]){
dst[15:0] := src1[79:64]
dst[31:16] := src2[79:64]
dst[47:32] := src1[95:80]
dst[63:48] := src2[95:80]
dst[79:64] := src1[111:96]
dst[95:80] := src2[111:96]
dst[111:96] := src1[127:112]
dst[127:112] := src2[127:112]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128])
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := tmp_dst[i+15:i]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
vpunpckhwd
__m256i _mm256_maskz_unpackhi_epi16 (__mmask16 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_unpackhi_epi16 (__mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpunpckhwd
CPUID Flags: AVX512VL + AVX512BW
Description
Unpack and interleave 16-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]){
dst[15:0] := src1[79:64]
dst[31:16] := src2[79:64]
dst[47:32] := src1[95:80]
dst[63:48] := src2[95:80]
dst[79:64] := src1[111:96]
dst[95:80] := src2[111:96]
dst[111:96] := src1[127:112]
dst[127:112] := src2[127:112]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128])
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := tmp_dst[i+15:i]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpunpckhwd
__m256i _mm256_unpackhi_epi16 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_unpackhi_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpunpckhwd ymm, ymm, ymm
CPUID Flags: AVX2
Description
Unpack and interleave 16-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst.
Operation
INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]){
dst[15:0] := src1[79:64]
dst[31:16] := src2[79:64]
dst[47:32] := src1[95:80]
dst[63:48] := src2[95:80]
dst[79:64] := src1[111:96]
dst[95:80] := src2[111:96]
dst[111:96] := src1[127:112]
dst[127:112] := src2[127:112]
RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128])
dst[MAX:256] := 0
Performance
vpunpckhwd
__m512i _mm512_mask_unpackhi_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_unpackhi_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpunpckhwd
CPUID Flags: AVX512BW
Description
Unpack and interleave 16-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]){
dst[15:0] := src1[79:64]
dst[31:16] := src2[79:64]
dst[47:32] := src1[95:80]
dst[63:48] := src2[95:80]
dst[79:64] := src1[111:96]
dst[95:80] := src2[111:96]
dst[111:96] := src1[127:112]
dst[127:112] := src2[127:112]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_HIGH_WORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_HIGH_WORDS(a[511:384], b[511:384])
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := tmp_dst[i+15:i]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:512] := 0
vpunpckhwd
__m512i _mm512_maskz_unpackhi_epi16 (__mmask32 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_unpackhi_epi16 (__mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpunpckhwd
CPUID Flags: AVX512BW
Description
Unpack and interleave 16-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]){
dst[15:0] := src1[79:64]
dst[31:16] := src2[79:64]
dst[47:32] := src1[95:80]
dst[63:48] := src2[95:80]
dst[79:64] := src1[111:96]
dst[95:80] := src2[111:96]
dst[111:96] := src1[127:112]
dst[127:112] := src2[127:112]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_HIGH_WORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_HIGH_WORDS(a[511:384], b[511:384])
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := tmp_dst[i+15:i]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpunpckhwd
__m512i _mm512_unpackhi_epi16 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_unpackhi_epi16 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpunpckhwd
CPUID Flags: AVX512BW
Description
Unpack and interleave 16-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst.
Operation
INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]){
dst[15:0] := src1[79:64]
dst[31:16] := src2[79:64]
dst[47:32] := src1[95:80]
dst[63:48] := src2[95:80]
dst[79:64] := src1[111:96]
dst[95:80] := src2[111:96]
dst[111:96] := src1[127:112]
dst[127:112] := src2[127:112]
RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128])
dst[383:256] := INTERLEAVE_HIGH_WORDS(a[383:256], b[383:256])
dst[511:384] := INTERLEAVE_HIGH_WORDS(a[511:384], b[511:384])
dst[MAX:512] := 0
vpunpckhdq
__m128i _mm_mask_unpackhi_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_unpackhi_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpunpckhdq
CPUID Flags: AVX512VL + AVX512F
Description
Unpack and interleave 32-bit integers from the high half of a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
dst[31:0] := src1[95:64]
dst[63:32] := src2[95:64]
dst[95:64] := src1[127:96]
dst[127:96] := src2[127:96]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vpunpckhdq
__m128i _mm_maskz_unpackhi_epi32 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_unpackhi_epi32 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpunpckhdq
CPUID Flags: AVX512VL + AVX512F
Description
Unpack and interleave 32-bit integers from the high half of a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
dst[31:0] := src1[95:64]
dst[63:32] := src2[95:64]
dst[95:64] := src1[127:96]
dst[127:96] := src2[127:96]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
punpckhdq
__m128i _mm_unpackhi_epi32 (__m128i a, __m128i b)
Synopsis
__m128i _mm_unpackhi_epi32 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: punpckhdq xmm, xmm
CPUID Flags: SSE2
Description
Unpack and interleave 32-bit integers from the high half of a and b, and store the results in dst.
Operation
INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
dst[31:0] := src1[95:64]
dst[63:32] := src2[95:64]
dst[95:64] := src1[127:96]
dst[127:96] := src2[127:96]
RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
Performance
vpunpckhdq
__m256i _mm256_mask_unpackhi_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_unpackhi_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpunpckhdq
CPUID Flags: AVX512VL + AVX512F
Description
Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
dst[31:0] := src1[95:64]
dst[63:32] := src2[95:64]
dst[95:64] := src1[127:96]
dst[127:96] := src2[127:96]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vpunpckhdq
__m256i _mm256_maskz_unpackhi_epi32 (__mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_unpackhi_epi32 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpunpckhdq
CPUID Flags: AVX512VL + AVX512F
Description
Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
dst[31:0] := src1[95:64]
dst[63:32] := src2[95:64]
dst[95:64] := src1[127:96]
dst[127:96] := src2[127:96]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpunpckhdq
__m256i _mm256_unpackhi_epi32 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_unpackhi_epi32 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpunpckhdq ymm, ymm, ymm
CPUID Flags: AVX2
Description
Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst.
Operation
INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
dst[31:0] := src1[95:64]
dst[63:32] := src2[95:64]
dst[95:64] := src1[127:96]
dst[127:96] := src2[127:96]
RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
dst[MAX:256] := 0
Performance
vpunpckhdq
__m512i _mm512_mask_unpackhi_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_unpackhi_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpunpckhdq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
dst[31:0] := src1[95:64]
dst[63:32] := src2[95:64]
dst[95:64] := src1[127:96]
dst[127:96] := src2[127:96]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384])
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpunpckhdq
__m512i _mm512_maskz_unpackhi_epi32 (__mmask16 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_unpackhi_epi32 (__mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpunpckhdq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
dst[31:0] := src1[95:64]
dst[63:32] := src2[95:64]
dst[95:64] := src1[127:96]
dst[127:96] := src2[127:96]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384])
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpunpckhdq
__m512i _mm512_unpackhi_epi32 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_unpackhi_epi32 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpunpckhdq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Unpack and interleave 32-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst.
Operation
INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
dst[31:0] := src1[95:64]
dst[63:32] := src2[95:64]
dst[95:64] := src1[127:96]
dst[127:96] := src2[127:96]
RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256])
dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384])
dst[MAX:512] := 0
vpunpckhqdq
__m128i _mm_mask_unpackhi_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_unpackhi_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpunpckhqdq
CPUID Flags: AVX512VL + AVX512F
Description
Unpack and interleave 64-bit integers from the high half of a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
dst[63:0] := src1[127:64]
dst[127:64] := src2[127:64]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpunpckhqdq
__m128i _mm_maskz_unpackhi_epi64 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_unpackhi_epi64 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpunpckhqdq
CPUID Flags: AVX512VL + AVX512F
Description
Unpack and interleave 64-bit integers from the high half of a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
dst[63:0] := src1[127:64]
dst[127:64] := src2[127:64]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
punpckhqdq
__m128i _mm_unpackhi_epi64 (__m128i a, __m128i b)
Synopsis
__m128i _mm_unpackhi_epi64 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: punpckhqdq xmm, xmm
CPUID Flags: SSE2
Description
Unpack and interleave 64-bit integers from the high half of a and b, and store the results in dst.
Operation
INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
dst[63:0] := src1[127:64]
dst[127:64] := src2[127:64]
RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
Performance
vpunpckhqdq
__m256i _mm256_mask_unpackhi_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_unpackhi_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpunpckhqdq
CPUID Flags: AVX512VL + AVX512F
Description
Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
dst[63:0] := src1[127:64]
dst[127:64] := src2[127:64]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpunpckhqdq
__m256i _mm256_maskz_unpackhi_epi64 (__mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_unpackhi_epi64 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpunpckhqdq
CPUID Flags: AVX512VL + AVX512F
Description
Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
dst[63:0] := src1[127:64]
dst[127:64] := src2[127:64]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpunpckhqdq
__m256i _mm256_unpackhi_epi64 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_unpackhi_epi64 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpunpckhqdq ymm, ymm, ymm
CPUID Flags: AVX2
Description
Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst.
Operation
INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
dst[63:0] := src1[127:64]
dst[127:64] := src2[127:64]
RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
dst[MAX:256] := 0
Performance
vpunpckhqdq
__m512i _mm512_mask_unpackhi_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_unpackhi_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpunpckhqdq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
dst[63:0] := src1[127:64]
dst[127:64] := src2[127:64]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384])
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpunpckhqdq
__m512i _mm512_maskz_unpackhi_epi64 (__mmask8 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_unpackhi_epi64 (__mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpunpckhqdq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
dst[63:0] := src1[127:64]
dst[127:64] := src2[127:64]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384])
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpunpckhqdq
__m512i _mm512_unpackhi_epi64 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_unpackhi_epi64 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpunpckhqdq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Unpack and interleave 64-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst.
Operation
INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
dst[63:0] := src1[127:64]
dst[127:64] := src2[127:64]
RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256])
dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384])
dst[MAX:512] := 0
vpunpckhbw
__m128i _mm_mask_unpackhi_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_unpackhi_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpunpckhbw
CPUID Flags: AVX512VL + AVX512BW
Description
Unpack and interleave 8-bit integers from the high half of a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]){
dst[7:0] := src1[71:64]
dst[15:8] := src2[71:64]
dst[23:16] := src1[79:72]
dst[31:24] := src2[79:72]
dst[39:32] := src1[87:80]
dst[47:40] := src2[87:80]
dst[55:48] := src1[95:88]
dst[63:56] := src2[95:88]
dst[71:64] := src1[103:96]
dst[79:72] := src2[103:96]
dst[87:80] := src1[111:104]
dst[95:88] := src2[111:104]
dst[103:96] := src1[119:112]
dst[111:104] := src2[119:112]
dst[119:112] := src1[127:120]
dst[127:120] := src2[127:120]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0])
FOR j := 0 to 15
i := j*8
IF k[j]
dst[i+7:i] := tmp_dst[i+7:i]
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:128] := 0
vpunpckhbw
__m128i _mm_maskz_unpackhi_epi8 (__mmask16 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_unpackhi_epi8 (__mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpunpckhbw
CPUID Flags: AVX512VL + AVX512BW
Description
Unpack and interleave 8-bit integers from the high half of a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]){
dst[7:0] := src1[71:64]
dst[15:8] := src2[71:64]
dst[23:16] := src1[79:72]
dst[31:24] := src2[79:72]
dst[39:32] := src1[87:80]
dst[47:40] := src2[87:80]
dst[55:48] := src1[95:88]
dst[63:56] := src2[95:88]
dst[71:64] := src1[103:96]
dst[79:72] := src2[103:96]
dst[87:80] := src1[111:104]
dst[95:88] := src2[111:104]
dst[103:96] := src1[119:112]
dst[111:104] := src2[119:112]
dst[119:112] := src1[127:120]
dst[127:120] := src2[127:120]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0])
FOR j := 0 to 15
i := j*8
IF k[j]
dst[i+7:i] := tmp_dst[i+7:i]
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
punpckhbw
__m128i _mm_unpackhi_epi8 (__m128i a, __m128i b)
Synopsis
__m128i _mm_unpackhi_epi8 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: punpckhbw xmm, xmm
CPUID Flags: SSE2
Description
Unpack and interleave 8-bit integers from the high half of a and b, and store the results in dst.
Operation
INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]){
dst[7:0] := src1[71:64]
dst[15:8] := src2[71:64]
dst[23:16] := src1[79:72]
dst[31:24] := src2[79:72]
dst[39:32] := src1[87:80]
dst[47:40] := src2[87:80]
dst[55:48] := src1[95:88]
dst[63:56] := src2[95:88]
dst[71:64] := src1[103:96]
dst[79:72] := src2[103:96]
dst[87:80] := src1[111:104]
dst[95:88] := src2[111:104]
dst[103:96] := src1[119:112]
dst[111:104] := src2[119:112]
dst[119:112] := src1[127:120]
dst[127:120] := src2[127:120]
RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0])
Performance
vpunpckhbw
__m256i _mm256_mask_unpackhi_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_unpackhi_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpunpckhbw
CPUID Flags: AVX512VL + AVX512BW
Description
Unpack and interleave 8-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]){
dst[7:0] := src1[71:64]
dst[15:8] := src2[71:64]
dst[23:16] := src1[79:72]
dst[31:24] := src2[79:72]
dst[39:32] := src1[87:80]
dst[47:40] := src2[87:80]
dst[55:48] := src1[95:88]
dst[63:56] := src2[95:88]
dst[71:64] := src1[103:96]
dst[79:72] := src2[103:96]
dst[87:80] := src1[111:104]
dst[95:88] := src2[111:104]
dst[103:96] := src1[119:112]
dst[111:104] := src2[119:112]
dst[119:112] := src1[127:120]
dst[127:120] := src2[127:120]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128])
FOR j := 0 to 31
i := j*8
IF k[j]
dst[i+7:i] := tmp_dst[i+7:i]
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:256] := 0
vpunpckhbw
__m256i _mm256_maskz_unpackhi_epi8 (__mmask32 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_unpackhi_epi8 (__mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpunpckhbw
CPUID Flags: AVX512VL + AVX512BW
Description
Unpack and interleave 8-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]){
dst[7:0] := src1[71:64]
dst[15:8] := src2[71:64]
dst[23:16] := src1[79:72]
dst[31:24] := src2[79:72]
dst[39:32] := src1[87:80]
dst[47:40] := src2[87:80]
dst[55:48] := src1[95:88]
dst[63:56] := src2[95:88]
dst[71:64] := src1[103:96]
dst[79:72] := src2[103:96]
dst[87:80] := src1[111:104]
dst[95:88] := src2[111:104]
dst[103:96] := src1[119:112]
dst[111:104] := src2[119:112]
dst[119:112] := src1[127:120]
dst[127:120] := src2[127:120]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128])
FOR j := 0 to 31
i := j*8
IF k[j]
dst[i+7:i] := tmp_dst[i+7:i]
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpunpckhbw
__m256i _mm256_unpackhi_epi8 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_unpackhi_epi8 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpunpckhbw ymm, ymm, ymm
CPUID Flags: AVX2
Description
Unpack and interleave 8-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst.
Operation
INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]){
dst[7:0] := src1[71:64]
dst[15:8] := src2[71:64]
dst[23:16] := src1[79:72]
dst[31:24] := src2[79:72]
dst[39:32] := src1[87:80]
dst[47:40] := src2[87:80]
dst[55:48] := src1[95:88]
dst[63:56] := src2[95:88]
dst[71:64] := src1[103:96]
dst[79:72] := src2[103:96]
dst[87:80] := src1[111:104]
dst[95:88] := src2[111:104]
dst[103:96] := src1[119:112]
dst[111:104] := src2[119:112]
dst[119:112] := src1[127:120]
dst[127:120] := src2[127:120]
RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128])
dst[MAX:256] := 0
Performance
vpunpckhbw
__m512i _mm512_mask_unpackhi_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_unpackhi_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpunpckhbw
CPUID Flags: AVX512BW
Description
Unpack and interleave 8-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]){
dst[7:0] := src1[71:64]
dst[15:8] := src2[71:64]
dst[23:16] := src1[79:72]
dst[31:24] := src2[79:72]
dst[39:32] := src1[87:80]
dst[47:40] := src2[87:80]
dst[55:48] := src1[95:88]
dst[63:56] := src2[95:88]
dst[71:64] := src1[103:96]
dst[79:72] := src2[103:96]
dst[87:80] := src1[111:104]
dst[95:88] := src2[111:104]
dst[103:96] := src1[119:112]
dst[111:104] := src2[119:112]
dst[119:112] := src1[127:120]
dst[127:120] := src2[127:120]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_HIGH_BYTES(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_HIGH_BYTES(a[511:384], b[511:384])
FOR j := 0 to 63
i := j*8
IF k[j]
dst[i+7:i] := tmp_dst[i+7:i]
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:512] := 0
vpunpckhbw
__m512i _mm512_maskz_unpackhi_epi8 (__mmask64 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_unpackhi_epi8 (__mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpunpckhbw
CPUID Flags: AVX512BW
Description
Unpack and interleave 8-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]){
dst[7:0] := src1[71:64]
dst[15:8] := src2[71:64]
dst[23:16] := src1[79:72]
dst[31:24] := src2[79:72]
dst[39:32] := src1[87:80]
dst[47:40] := src2[87:80]
dst[55:48] := src1[95:88]
dst[63:56] := src2[95:88]
dst[71:64] := src1[103:96]
dst[79:72] := src2[103:96]
dst[87:80] := src1[111:104]
dst[95:88] := src2[111:104]
dst[103:96] := src1[119:112]
dst[111:104] := src2[119:112]
dst[119:112] := src1[127:120]
dst[127:120] := src2[127:120]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_HIGH_BYTES(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_HIGH_BYTES(a[511:384], b[511:384])
FOR j := 0 to 63
i := j*8
IF k[j]
dst[i+7:i] := tmp_dst[i+7:i]
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpunpckhbw
__m512i _mm512_unpackhi_epi8 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_unpackhi_epi8 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpunpckhbw
CPUID Flags: AVX512BW
Description
Unpack and interleave 8-bit integers from the high half of each 128-bit lane in a and b, and store the results in dst.
Operation
INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]){
dst[7:0] := src1[71:64]
dst[15:8] := src2[71:64]
dst[23:16] := src1[79:72]
dst[31:24] := src2[79:72]
dst[39:32] := src1[87:80]
dst[47:40] := src2[87:80]
dst[55:48] := src1[95:88]
dst[63:56] := src2[95:88]
dst[71:64] := src1[103:96]
dst[79:72] := src2[103:96]
dst[87:80] := src1[111:104]
dst[95:88] := src2[111:104]
dst[103:96] := src1[119:112]
dst[111:104] := src2[119:112]
dst[119:112] := src1[127:120]
dst[127:120] := src2[127:120]
RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128])
dst[383:256] := INTERLEAVE_HIGH_BYTES(a[383:256], b[383:256])
dst[511:384] := INTERLEAVE_HIGH_BYTES(a[511:384], b[511:384])
dst[MAX:512] := 0
vunpckhpd
__m128d _mm_mask_unpackhi_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_mask_unpackhi_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vunpckhpd
CPUID Flags: AVX512VL + AVX512F
Description
Unpack and interleave double-precision (64-bit) floating-point elements from the high half of a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
dst[63:0] := src1[127:64]
dst[127:64] := src2[127:64]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vunpckhpd
__m128d _mm_maskz_unpackhi_pd (__mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_maskz_unpackhi_pd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vunpckhpd
CPUID Flags: AVX512VL + AVX512F
Description
Unpack and interleave double-precision (64-bit) floating-point elements from the high half of a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
dst[63:0] := src1[127:64]
dst[127:64] := src2[127:64]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
unpckhpd
__m128d _mm_unpackhi_pd (__m128d a, __m128d b)
Synopsis
__m128d _mm_unpackhi_pd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: unpckhpd xmm, xmm
CPUID Flags: SSE2
Description
Unpack and interleave double-precision (64-bit) floating-point elements from the high half of a and b, and store the results in dst.
Operation
INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
dst[63:0] := src1[127:64]
dst[127:64] := src2[127:64]
RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
Performance
vunpckhpd
__m256d _mm256_mask_unpackhi_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)
Synopsis
__m256d _mm256_mask_unpackhi_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vunpckhpd
CPUID Flags: AVX512VL + AVX512F
Description
Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
dst[63:0] := src1[127:64]
dst[127:64] := src2[127:64]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vunpckhpd
__m256d _mm256_maskz_unpackhi_pd (__mmask8 k, __m256d a, __m256d b)
Synopsis
__m256d _mm256_maskz_unpackhi_pd (__mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vunpckhpd
CPUID Flags: AVX512VL + AVX512F
Description
Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
dst[63:0] := src1[127:64]
dst[127:64] := src2[127:64]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vunpckhpd
__m256d _mm256_unpackhi_pd (__m256d a, __m256d b)
Synopsis
__m256d _mm256_unpackhi_pd (__m256d a, __m256d b)
#include "immintrin.h"
Instruction: vunpckhpd ymm, ymm, ymm
CPUID Flags: AVX
Description
Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst.
Operation
INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
dst[63:0] := src1[127:64]
dst[127:64] := src2[127:64]
RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
dst[MAX:256] := 0
Performance
vunpckhpd
__m512d _mm512_mask_unpackhi_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
Synopsis
__m512d _mm512_mask_unpackhi_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vunpckhpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
dst[63:0] := src1[127:64]
dst[127:64] := src2[127:64]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384])
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vunpckhpd
__m512d _mm512_maskz_unpackhi_pd (__mmask8 k, __m512d a, __m512d b)
Synopsis
__m512d _mm512_maskz_unpackhi_pd (__mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vunpckhpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
dst[63:0] := src1[127:64]
dst[127:64] := src2[127:64]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384])
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vunpckhpd
__m512d _mm512_unpackhi_pd (__m512d a, __m512d b)
Synopsis
__m512d _mm512_unpackhi_pd (__m512d a, __m512d b)
#include "immintrin.h"
Instruction: vunpckhpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Unpack and interleave double-precision (64-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst.
Operation
INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
dst[63:0] := src1[127:64]
dst[127:64] := src2[127:64]
RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
dst[383:256] := INTERLEAVE_HIGH_QWORDS(a[383:256], b[383:256])
dst[511:384] := INTERLEAVE_HIGH_QWORDS(a[511:384], b[511:384])
dst[MAX:512] := 0
vunpckhps
__m128 _mm_mask_unpackhi_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_mask_unpackhi_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vunpckhps
CPUID Flags: AVX512VL + AVX512F
Description
Unpack and interleave single-precision (32-bit) floating-point elements from the high half of a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
dst[31:0] := src1[95:64]
dst[63:32] := src2[95:64]
dst[95:64] := src1[127:96]
dst[127:96] := src2[127:96]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vunpckhps
__m128 _mm_maskz_unpackhi_ps (__mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_maskz_unpackhi_ps (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vunpckhps
CPUID Flags: AVX512VL + AVX512F
Description
Unpack and interleave single-precision (32-bit) floating-point elements from the high half of a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
dst[31:0] := src1[95:64]
dst[63:32] := src2[95:64]
dst[95:64] := src1[127:96]
dst[127:96] := src2[127:96]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
unpckhps
__m128 _mm_unpackhi_ps (__m128 a, __m128 b)
Synopsis
__m128 _mm_unpackhi_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: unpckhps xmm, xmm
CPUID Flags: SSE
Description
Unpack and interleave single-precision (32-bit) floating-point elements from the high half of a and b, and store the results in dst.
Operation
INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
dst[31:0] := src1[95:64]
dst[63:32] := src2[95:64]
dst[95:64] := src1[127:96]
dst[127:96] := src2[127:96]
RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
Performance
vunpckhps
__m256 _mm256_mask_unpackhi_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)
Synopsis
__m256 _mm256_mask_unpackhi_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vunpckhps
CPUID Flags: AVX512VL + AVX512F
Description
Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
dst[31:0] := src1[95:64]
dst[63:32] := src2[95:64]
dst[95:64] := src1[127:96]
dst[127:96] := src2[127:96]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vunpckhps
__m256 _mm256_maskz_unpackhi_ps (__mmask8 k, __m256 a, __m256 b)
Synopsis
__m256 _mm256_maskz_unpackhi_ps (__mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vunpckhps
CPUID Flags: AVX512VL + AVX512F
Description
Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
dst[31:0] := src1[95:64]
dst[63:32] := src2[95:64]
dst[95:64] := src1[127:96]
dst[127:96] := src2[127:96]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vunpckhps
__m256 _mm256_unpackhi_ps (__m256 a, __m256 b)
Synopsis
__m256 _mm256_unpackhi_ps (__m256 a, __m256 b)
#include "immintrin.h"
Instruction: vunpckhps ymm, ymm, ymm
CPUID Flags: AVX
Description
Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst.
Operation
INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
dst[31:0] := src1[95:64]
dst[63:32] := src2[95:64]
dst[95:64] := src1[127:96]
dst[127:96] := src2[127:96]
RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
dst[MAX:256] := 0
Performance
vunpckhps
__m512 _mm512_mask_unpackhi_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
Synopsis
__m512 _mm512_mask_unpackhi_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vunpckhps zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
dst[31:0] := src1[95:64]
dst[63:32] := src2[95:64]
dst[95:64] := src1[127:96]
dst[127:96] := src2[127:96]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384])
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vunpckhps
__m512 _mm512_maskz_unpackhi_ps (__mmask16 k, __m512 a, __m512 b)
Synopsis
__m512 _mm512_maskz_unpackhi_ps (__mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vunpckhps zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
dst[31:0] := src1[95:64]
dst[63:32] := src2[95:64]
dst[95:64] := src1[127:96]
dst[127:96] := src2[127:96]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384])
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vunpckhps
__m512 _mm512_unpackhi_ps (__m512 a, __m512 b)
Synopsis
__m512 _mm512_unpackhi_ps (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vunpckhps zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Unpack and interleave single-precision (32-bit) floating-point elements from the high half of each 128-bit lane in a and b, and store the results in dst.
Operation
INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
dst[31:0] := src1[95:64]
dst[63:32] := src2[95:64]
dst[95:64] := src1[127:96]
dst[127:96] := src2[127:96]
RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
dst[383:256] := INTERLEAVE_HIGH_DWORDS(a[383:256], b[383:256])
dst[511:384] := INTERLEAVE_HIGH_DWORDS(a[511:384], b[511:384])
dst[MAX:512] := 0
vpunpcklwd
__m128i _mm_mask_unpacklo_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_unpacklo_epi16 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpunpcklwd
CPUID Flags: AVX512VL + AVX512BW
Description
Unpack and interleave 16-bit integers from the low half of a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
INTERLEAVE_WORDS(src1[127:0], src2[127:0]){
dst[15:0] := src1[15:0]
dst[31:16] := src2[15:0]
dst[47:32] := src1[31:16]
dst[63:48] := src2[31:16]
dst[79:64] := src1[47:32]
dst[95:80] := src2[47:32]
dst[111:96] := src1[63:48]
dst[127:112] := src2[63:48]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0])
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := tmp_dst[i+15:i]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:128] := 0
vpunpcklwd
__m128i _mm_maskz_unpacklo_epi16 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_unpacklo_epi16 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpunpcklwd
CPUID Flags: AVX512VL + AVX512BW
Description
Unpack and interleave 16-bit integers from the low half of a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
INTERLEAVE_WORDS(src1[127:0], src2[127:0]){
dst[15:0] := src1[15:0]
dst[31:16] := src2[15:0]
dst[47:32] := src1[31:16]
dst[63:48] := src2[31:16]
dst[79:64] := src1[47:32]
dst[95:80] := src2[47:32]
dst[111:96] := src1[63:48]
dst[127:112] := src2[63:48]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0])
FOR j := 0 to 7
i := j*16
IF k[j]
dst[i+15:i] := tmp_dst[i+15:i]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
punpcklwd
__m128i _mm_unpacklo_epi16 (__m128i a, __m128i b)
Synopsis
__m128i _mm_unpacklo_epi16 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: punpcklwd xmm, xmm
CPUID Flags: SSE2
Description
Unpack and interleave 16-bit integers from the low half of a and b, and store the results in dst.
Operation
INTERLEAVE_WORDS(src1[127:0], src2[127:0]){
dst[15:0] := src1[15:0]
dst[31:16] := src2[15:0]
dst[47:32] := src1[31:16]
dst[63:48] := src2[31:16]
dst[79:64] := src1[47:32]
dst[95:80] := src2[47:32]
dst[111:96] := src1[63:48]
dst[127:112] := src2[63:48]
RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0])
Performance
vpunpcklwd
__m256i _mm256_mask_unpacklo_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_unpacklo_epi16 (__m256i src, __mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpunpcklwd
CPUID Flags: AVX512VL + AVX512BW
Description
Unpack and interleave 16-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
INTERLEAVE_WORDS(src1[127:0], src2[127:0]){
dst[15:0] := src1[15:0]
dst[31:16] := src2[15:0]
dst[47:32] := src1[31:16]
dst[63:48] := src2[31:16]
dst[79:64] := src1[47:32]
dst[95:80] := src2[47:32]
dst[111:96] := src1[63:48]
dst[127:112] := src2[63:48]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128])
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := tmp_dst[i+15:i]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:256] := 0
vpunpcklwd
__m256i _mm256_maskz_unpacklo_epi16 (__mmask16 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_unpacklo_epi16 (__mmask16 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpunpcklwd
CPUID Flags: AVX512VL + AVX512BW
Description
Unpack and interleave 16-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
INTERLEAVE_WORDS(src1[127:0], src2[127:0]){
dst[15:0] := src1[15:0]
dst[31:16] := src2[15:0]
dst[47:32] := src1[31:16]
dst[63:48] := src2[31:16]
dst[79:64] := src1[47:32]
dst[95:80] := src2[47:32]
dst[111:96] := src1[63:48]
dst[127:112] := src2[63:48]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128])
FOR j := 0 to 15
i := j*16
IF k[j]
dst[i+15:i] := tmp_dst[i+15:i]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpunpcklwd
__m256i _mm256_unpacklo_epi16 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_unpacklo_epi16 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpunpcklwd ymm, ymm, ymm
CPUID Flags: AVX2
Description
Unpack and interleave 16-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst.
Operation
INTERLEAVE_WORDS(src1[127:0], src2[127:0]){
dst[15:0] := src1[15:0]
dst[31:16] := src2[15:0]
dst[47:32] := src1[31:16]
dst[63:48] := src2[31:16]
dst[79:64] := src1[47:32]
dst[95:80] := src2[47:32]
dst[111:96] := src1[63:48]
dst[127:112] := src2[63:48]
RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128])
dst[MAX:256] := 0
Performance
vpunpcklwd
__m512i _mm512_mask_unpacklo_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_unpacklo_epi16 (__m512i src, __mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpunpcklwd
CPUID Flags: AVX512BW
Description
Unpack and interleave 16-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
INTERLEAVE_WORDS(src1[127:0], src2[127:0]){
dst[15:0] := src1[15:0]
dst[31:16] := src2[15:0]
dst[47:32] := src1[31:16]
dst[63:48] := src2[31:16]
dst[79:64] := src1[47:32]
dst[95:80] := src2[47:32]
dst[111:96] := src1[63:48]
dst[127:112] := src2[63:48]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_WORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_WORDS(a[511:384], b[511:384])
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := tmp_dst[i+15:i]
ELSE
dst[i+15:i] := src[i+15:i]
FI
ENDFOR
dst[MAX:512] := 0
vpunpcklwd
__m512i _mm512_maskz_unpacklo_epi16 (__mmask32 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_unpacklo_epi16 (__mmask32 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpunpcklwd
CPUID Flags: AVX512BW
Description
Unpack and interleave 16-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
INTERLEAVE_WORDS(src1[127:0], src2[127:0]){
dst[15:0] := src1[15:0]
dst[31:16] := src2[15:0]
dst[47:32] := src1[31:16]
dst[63:48] := src2[31:16]
dst[79:64] := src1[47:32]
dst[95:80] := src2[47:32]
dst[111:96] := src1[63:48]
dst[127:112] := src2[63:48]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_WORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_WORDS(a[511:384], b[511:384])
FOR j := 0 to 31
i := j*16
IF k[j]
dst[i+15:i] := tmp_dst[i+15:i]
ELSE
dst[i+15:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpunpcklwd
__m512i _mm512_unpacklo_epi16 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_unpacklo_epi16 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpunpcklwd zmm, zmm, zmm
CPUID Flags: AVX512BW
Description
Unpack and interleave 16-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst.
Operation
INTERLEAVE_WORDS(src1[127:0], src2[127:0]){
dst[15:0] := src1[15:0]
dst[31:16] := src2[15:0]
dst[47:32] := src1[31:16]
dst[63:48] := src2[31:16]
dst[79:64] := src1[47:32]
dst[95:80] := src2[47:32]
dst[111:96] := src1[63:48]
dst[127:112] := src2[63:48]
RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128])
dst[383:256] := INTERLEAVE_WORDS(a[383:256], b[383:256])
dst[511:384] := INTERLEAVE_WORDS(a[511:384], b[511:384])
dst[MAX:512] := 0
vpunpckldq
__m128i _mm_mask_unpacklo_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_unpacklo_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpunpckldq
CPUID Flags: AVX512VL + AVX512F
Description
Unpack and interleave 32-bit integers from the low half of a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
dst[31:0] := src1[31:0]
dst[63:32] := src2[31:0]
dst[95:64] := src1[63:32]
dst[127:96] := src2[63:32]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vpunpckldq
__m128i _mm_maskz_unpacklo_epi32 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_unpacklo_epi32 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpunpckldq
CPUID Flags: AVX512VL + AVX512F
Description
Unpack and interleave 32-bit integers from the low half of a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
dst[31:0] := src1[31:0]
dst[63:32] := src2[31:0]
dst[95:64] := src1[63:32]
dst[127:96] := src2[63:32]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
punpckldq
__m128i _mm_unpacklo_epi32 (__m128i a, __m128i b)
Synopsis
__m128i _mm_unpacklo_epi32 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: punpckldq xmm, xmm
CPUID Flags: SSE2
Description
Unpack and interleave 32-bit integers from the low half of a and b, and store the results in dst.
Operation
INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
dst[31:0] := src1[31:0]
dst[63:32] := src2[31:0]
dst[95:64] := src1[63:32]
dst[127:96] := src2[63:32]
RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
Performance
vpunpckldq
__m256i _mm256_mask_unpacklo_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_unpacklo_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpunpckldq
CPUID Flags: AVX512VL + AVX512F
Description
Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
dst[31:0] := src1[31:0]
dst[63:32] := src2[31:0]
dst[95:64] := src1[63:32]
dst[127:96] := src2[63:32]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vpunpckldq
__m256i _mm256_maskz_unpacklo_epi32 (__mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_unpacklo_epi32 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpunpckldq
CPUID Flags: AVX512VL + AVX512F
Description
Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
dst[31:0] := src1[31:0]
dst[63:32] := src2[31:0]
dst[95:64] := src1[63:32]
dst[127:96] := src2[63:32]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpunpckldq
__m256i _mm256_unpacklo_epi32 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_unpacklo_epi32 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpunpckldq ymm, ymm, ymm
CPUID Flags: AVX2
Description
Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst.
Operation
INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
dst[31:0] := src1[31:0]
dst[63:32] := src2[31:0]
dst[95:64] := src1[63:32]
dst[127:96] := src2[63:32]
RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
dst[MAX:256] := 0
Performance
vpunpckldq
__m512i _mm512_mask_unpacklo_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_unpacklo_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpunpckldq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
dst[31:0] := src1[31:0]
dst[63:32] := src2[31:0]
dst[95:64] := src1[63:32]
dst[127:96] := src2[63:32]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384])
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpunpckldq
__m512i _mm512_maskz_unpacklo_epi32 (__mmask16 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_unpacklo_epi32 (__mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpunpckldq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
dst[31:0] := src1[31:0]
dst[63:32] := src2[31:0]
dst[95:64] := src1[63:32]
dst[127:96] := src2[63:32]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384])
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpunpckldq
__m512i _mm512_unpacklo_epi32 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_unpacklo_epi32 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpunpckldq zmm, zmm, zmm
CPUID Flags: AVX512F
Description
Unpack and interleave 32-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst.
Operation
INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
dst[31:0] := src1[31:0]
dst[63:32] := src2[31:0]
dst[95:64] := src1[63:32]
dst[127:96] := src2[63:32]
RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256])
dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384])
dst[MAX:512] := 0
vpunpcklqdq
__m128i _mm_mask_unpacklo_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_unpacklo_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpunpcklqdq
CPUID Flags: AVX512VL + AVX512F
Description
Unpack and interleave 64-bit integers from the low half of a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
dst[63:0] := src1[63:0]
dst[127:64] := src2[63:0]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpunpcklqdq
__m128i _mm_maskz_unpacklo_epi64 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_unpacklo_epi64 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpunpcklqdq
CPUID Flags: AVX512VL + AVX512F
Description
Unpack and interleave 64-bit integers from the low half of a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
dst[63:0] := src1[63:0]
dst[127:64] := src2[63:0]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
punpcklqdq
__m128i _mm_unpacklo_epi64 (__m128i a, __m128i b)
Synopsis
__m128i _mm_unpacklo_epi64 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: punpcklqdq xmm, xmm
CPUID Flags: SSE2
Description
Unpack and interleave 64-bit integers from the low half of a and b, and store the results in dst.
Operation
INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
dst[63:0] := src1[63:0]
dst[127:64] := src2[63:0]
RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
Performance
vpunpcklqdq
__m256i _mm256_mask_unpacklo_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_unpacklo_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpunpcklqdq
CPUID Flags: AVX512VL + AVX512F
Description
Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
dst[63:0] := src1[63:0]
dst[127:64] := src2[63:0]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpunpcklqdq
__m256i _mm256_maskz_unpacklo_epi64 (__mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_unpacklo_epi64 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpunpcklqdq
CPUID Flags: AVX512VL + AVX512F
Description
Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
dst[63:0] := src1[63:0]
dst[127:64] := src2[63:0]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpunpcklqdq
__m256i _mm256_unpacklo_epi64 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_unpacklo_epi64 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpunpcklqdq ymm, ymm, ymm
CPUID Flags: AVX2
Description
Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst.
Operation
INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
dst[63:0] := src1[63:0]
dst[127:64] := src2[63:0]
RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
dst[MAX:256] := 0
Performance
vpunpcklqdq
__m512i _mm512_mask_unpacklo_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_unpacklo_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpunpcklqdq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
dst[63:0] := src1[63:0]
dst[127:64] := src2[63:0]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384])
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpunpcklqdq
__m512i _mm512_maskz_unpacklo_epi64 (__mmask8 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_unpacklo_epi64 (__mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpunpcklqdq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
dst[63:0] := src1[63:0]
dst[127:64] := src2[63:0]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384])
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpunpcklqdq
__m512i _mm512_unpacklo_epi64 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_unpacklo_epi64 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpunpcklqdq zmm, zmm, zmm
CPUID Flags: AVX512F
Description
Unpack and interleave 64-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst.
Operation
INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
dst[63:0] := src1[63:0]
dst[127:64] := src2[63:0]
RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256])
dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384])
dst[MAX:512] := 0
vpunpcklbw
__m128i _mm_mask_unpacklo_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_unpacklo_epi8 (__m128i src, __mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpunpcklbw
CPUID Flags: AVX512VL + AVX512BW
Description
Unpack and interleave 8-bit integers from the low half of a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
INTERLEAVE_BYTES(src1[127:0], src2[127:0]){
dst[7:0] := src1[7:0]
dst[15:8] := src2[7:0]
dst[23:16] := src1[15:8]
dst[31:24] := src2[15:8]
dst[39:32] := src1[23:16]
dst[47:40] := src2[23:16]
dst[55:48] := src1[31:24]
dst[63:56] := src2[31:24]
dst[71:64] := src1[39:32]
dst[79:72] := src2[39:32]
dst[87:80] := src1[47:40]
dst[95:88] := src2[47:40]
dst[103:96] := src1[55:48]
dst[111:104] := src2[55:48]
dst[119:112] := src1[63:56]
dst[127:120] := src2[63:56]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0])
FOR j := 0 to 15
i := j*8
IF k[j]
dst[i+7:i] := tmp_dst[i+7:i]
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:128] := 0
vpunpcklbw
__m128i _mm_maskz_unpacklo_epi8 (__mmask16 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_unpacklo_epi8 (__mmask16 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpunpcklbw
CPUID Flags: AVX512VL + AVX512BW
Description
Unpack and interleave 8-bit integers from the low half of a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
INTERLEAVE_BYTES(src1[127:0], src2[127:0]){
dst[7:0] := src1[7:0]
dst[15:8] := src2[7:0]
dst[23:16] := src1[15:8]
dst[31:24] := src2[15:8]
dst[39:32] := src1[23:16]
dst[47:40] := src2[23:16]
dst[55:48] := src1[31:24]
dst[63:56] := src2[31:24]
dst[71:64] := src1[39:32]
dst[79:72] := src2[39:32]
dst[87:80] := src1[47:40]
dst[95:88] := src2[47:40]
dst[103:96] := src1[55:48]
dst[111:104] := src2[55:48]
dst[119:112] := src1[63:56]
dst[127:120] := src2[63:56]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0])
FOR j := 0 to 15
i := j*8
IF k[j]
dst[i+7:i] := tmp_dst[i+7:i]
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
punpcklbw
__m128i _mm_unpacklo_epi8 (__m128i a, __m128i b)
Synopsis
__m128i _mm_unpacklo_epi8 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: punpcklbw xmm, xmm
CPUID Flags: SSE2
Description
Unpack and interleave 8-bit integers from the low half of a and b, and store the results in dst.
Operation
INTERLEAVE_BYTES(src1[127:0], src2[127:0]){
dst[7:0] := src1[7:0]
dst[15:8] := src2[7:0]
dst[23:16] := src1[15:8]
dst[31:24] := src2[15:8]
dst[39:32] := src1[23:16]
dst[47:40] := src2[23:16]
dst[55:48] := src1[31:24]
dst[63:56] := src2[31:24]
dst[71:64] := src1[39:32]
dst[79:72] := src2[39:32]
dst[87:80] := src1[47:40]
dst[95:88] := src2[47:40]
dst[103:96] := src1[55:48]
dst[111:104] := src2[55:48]
dst[119:112] := src1[63:56]
dst[127:120] := src2[63:56]
RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0])
Performance
vpunpcklbw
__m256i _mm256_mask_unpacklo_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_unpacklo_epi8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpunpcklbw
CPUID Flags: AVX512VL + AVX512BW
Description
Unpack and interleave 8-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
INTERLEAVE_BYTES(src1[127:0], src2[127:0]){
dst[7:0] := src1[7:0]
dst[15:8] := src2[7:0]
dst[23:16] := src1[15:8]
dst[31:24] := src2[15:8]
dst[39:32] := src1[23:16]
dst[47:40] := src2[23:16]
dst[55:48] := src1[31:24]
dst[63:56] := src2[31:24]
dst[71:64] := src1[39:32]
dst[79:72] := src2[39:32]
dst[87:80] := src1[47:40]
dst[95:88] := src2[47:40]
dst[103:96] := src1[55:48]
dst[111:104] := src2[55:48]
dst[119:112] := src1[63:56]
dst[127:120] := src2[63:56]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128])
FOR j := 0 to 31
i := j*8
IF k[j]
dst[i+7:i] := tmp_dst[i+7:i]
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:256] := 0
vpunpcklbw
__m256i _mm256_maskz_unpacklo_epi8 (__mmask32 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_unpacklo_epi8 (__mmask32 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpunpcklbw
CPUID Flags: AVX512VL + AVX512BW
Description
Unpack and interleave 8-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
INTERLEAVE_BYTES(src1[127:0], src2[127:0]){
dst[7:0] := src1[7:0]
dst[15:8] := src2[7:0]
dst[23:16] := src1[15:8]
dst[31:24] := src2[15:8]
dst[39:32] := src1[23:16]
dst[47:40] := src2[23:16]
dst[55:48] := src1[31:24]
dst[63:56] := src2[31:24]
dst[71:64] := src1[39:32]
dst[79:72] := src2[39:32]
dst[87:80] := src1[47:40]
dst[95:88] := src2[47:40]
dst[103:96] := src1[55:48]
dst[111:104] := src2[55:48]
dst[119:112] := src1[63:56]
dst[127:120] := src2[63:56]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128])
FOR j := 0 to 31
i := j*8
IF k[j]
dst[i+7:i] := tmp_dst[i+7:i]
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpunpcklbw
__m256i _mm256_unpacklo_epi8 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_unpacklo_epi8 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpunpcklbw ymm, ymm, ymm
CPUID Flags: AVX2
Description
Unpack and interleave 8-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst.
Operation
INTERLEAVE_BYTES(src1[127:0], src2[127:0]){
dst[7:0] := src1[7:0]
dst[15:8] := src2[7:0]
dst[23:16] := src1[15:8]
dst[31:24] := src2[15:8]
dst[39:32] := src1[23:16]
dst[47:40] := src2[23:16]
dst[55:48] := src1[31:24]
dst[63:56] := src2[31:24]
dst[71:64] := src1[39:32]
dst[79:72] := src2[39:32]
dst[87:80] := src1[47:40]
dst[95:88] := src2[47:40]
dst[103:96] := src1[55:48]
dst[111:104] := src2[55:48]
dst[119:112] := src1[63:56]
dst[127:120] := src2[63:56]
RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128])
dst[MAX:256] := 0
Performance
vpunpcklbw
__m512i _mm512_mask_unpacklo_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_unpacklo_epi8 (__m512i src, __mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpunpcklbw
CPUID Flags: AVX512BW
Description
Unpack and interleave 8-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
INTERLEAVE_BYTES(src1[127:0], src2[127:0]){
dst[7:0] := src1[7:0]
dst[15:8] := src2[7:0]
dst[23:16] := src1[15:8]
dst[31:24] := src2[15:8]
dst[39:32] := src1[23:16]
dst[47:40] := src2[23:16]
dst[55:48] := src1[31:24]
dst[63:56] := src2[31:24]
dst[71:64] := src1[39:32]
dst[79:72] := src2[39:32]
dst[87:80] := src1[47:40]
dst[95:88] := src2[47:40]
dst[103:96] := src1[55:48]
dst[111:104] := src2[55:48]
dst[119:112] := src1[63:56]
dst[127:120] := src2[63:56]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_BYTES(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_BYTES(a[511:384], b[511:384])
FOR j := 0 to 63
i := j*8
IF k[j]
dst[i+7:i] := tmp_dst[i+7:i]
ELSE
dst[i+7:i] := src[i+7:i]
FI
ENDFOR
dst[MAX:512] := 0
vpunpcklbw
__m512i _mm512_maskz_unpacklo_epi8 (__mmask64 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_unpacklo_epi8 (__mmask64 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpunpcklbw
CPUID Flags: AVX512BW
Description
Unpack and interleave 8-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
INTERLEAVE_BYTES(src1[127:0], src2[127:0]){
dst[7:0] := src1[7:0]
dst[15:8] := src2[7:0]
dst[23:16] := src1[15:8]
dst[31:24] := src2[15:8]
dst[39:32] := src1[23:16]
dst[47:40] := src2[23:16]
dst[55:48] := src1[31:24]
dst[63:56] := src2[31:24]
dst[71:64] := src1[39:32]
dst[79:72] := src2[39:32]
dst[87:80] := src1[47:40]
dst[95:88] := src2[47:40]
dst[103:96] := src1[55:48]
dst[111:104] := src2[55:48]
dst[119:112] := src1[63:56]
dst[127:120] := src2[63:56]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_BYTES(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_BYTES(a[511:384], b[511:384])
FOR j := 0 to 63
i := j*8
IF k[j]
dst[i+7:i] := tmp_dst[i+7:i]
ELSE
dst[i+7:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpunpcklbw
__m512i _mm512_unpacklo_epi8 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_unpacklo_epi8 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpunpcklbw zmm, zmm, zmm
CPUID Flags: AVX512BW
Description
Unpack and interleave 8-bit integers from the low half of each 128-bit lane in a and b, and store the results in dst.
Operation
INTERLEAVE_BYTES(src1[127:0], src2[127:0]){
dst[7:0] := src1[7:0]
dst[15:8] := src2[7:0]
dst[23:16] := src1[15:8]
dst[31:24] := src2[15:8]
dst[39:32] := src1[23:16]
dst[47:40] := src2[23:16]
dst[55:48] := src1[31:24]
dst[63:56] := src2[31:24]
dst[71:64] := src1[39:32]
dst[79:72] := src2[39:32]
dst[87:80] := src1[47:40]
dst[95:88] := src2[47:40]
dst[103:96] := src1[55:48]
dst[111:104] := src2[55:48]
dst[119:112] := src1[63:56]
dst[127:120] := src2[63:56]
RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128])
dst[383:256] := INTERLEAVE_BYTES(a[383:256], b[383:256])
dst[511:384] := INTERLEAVE_BYTES(a[511:384], b[511:384])
dst[MAX:512] := 0
vunpcklpd
__m128d _mm_mask_unpacklo_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_mask_unpacklo_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vunpcklpd
CPUID Flags: AVX512VL + AVX512F
Description
Unpack and interleave double-precision (64-bit) floating-point elements from the low half of a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
dst[63:0] := src1[63:0]
dst[127:64] := src2[63:0]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vunpcklpd
__m128d _mm_maskz_unpacklo_pd (__mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_maskz_unpacklo_pd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vunpcklpd
CPUID Flags: AVX512VL + AVX512F
Description
Unpack and interleave double-precision (64-bit) floating-point elements from the low half of a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
dst[63:0] := src1[63:0]
dst[127:64] := src2[63:0]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
unpcklpd
__m128d _mm_unpacklo_pd (__m128d a, __m128d b)
Synopsis
__m128d _mm_unpacklo_pd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: unpcklpd xmm, xmm
CPUID Flags: SSE2
Description
Unpack and interleave double-precision (64-bit) floating-point elements from the low half of a and b, and store the results in dst.
Operation
INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
dst[63:0] := src1[63:0]
dst[127:64] := src2[63:0]
RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
Performance
vunpcklpd
__m256d _mm256_mask_unpacklo_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)
Synopsis
__m256d _mm256_mask_unpacklo_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vunpcklpd
CPUID Flags: AVX512VL + AVX512F
Description
Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
dst[63:0] := src1[63:0]
dst[127:64] := src2[63:0]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vunpcklpd
__m256d _mm256_maskz_unpacklo_pd (__mmask8 k, __m256d a, __m256d b)
Synopsis
__m256d _mm256_maskz_unpacklo_pd (__mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vunpcklpd
CPUID Flags: AVX512VL + AVX512F
Description
Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
dst[63:0] := src1[63:0]
dst[127:64] := src2[63:0]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vunpcklpd
__m256d _mm256_unpacklo_pd (__m256d a, __m256d b)
Synopsis
__m256d _mm256_unpacklo_pd (__m256d a, __m256d b)
#include "immintrin.h"
Instruction: vunpcklpd ymm, ymm, ymm
CPUID Flags: AVX
Description
Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst.
Operation
INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
dst[63:0] := src1[63:0]
dst[127:64] := src2[63:0]
RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
dst[MAX:256] := 0
Performance
vunpcklpd
__m512d _mm512_mask_unpacklo_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
Synopsis
__m512d _mm512_mask_unpacklo_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vunpcklpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
dst[63:0] := src1[63:0]
dst[127:64] := src2[63:0]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384])
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vunpcklpd
__m512d _mm512_maskz_unpacklo_pd (__mmask8 k, __m512d a, __m512d b)
Synopsis
__m512d _mm512_maskz_unpacklo_pd (__mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vunpcklpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
dst[63:0] := src1[63:0]
dst[127:64] := src2[63:0]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384])
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := tmp_dst[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vunpcklpd
__m512d _mm512_unpacklo_pd (__m512d a, __m512d b)
Synopsis
__m512d _mm512_unpacklo_pd (__m512d a, __m512d b)
#include "immintrin.h"
Instruction: vunpcklpd zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Unpack and interleave double-precision (64-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst.
Operation
INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
dst[63:0] := src1[63:0]
dst[127:64] := src2[63:0]
RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
dst[383:256] := INTERLEAVE_QWORDS(a[383:256], b[383:256])
dst[511:384] := INTERLEAVE_QWORDS(a[511:384], b[511:384])
dst[MAX:512] := 0
vunpcklps
__m128 _mm_mask_unpacklo_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_mask_unpacklo_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vunpcklps
CPUID Flags: AVX512VL + AVX512F
Description
Unpack and interleave single-precision (32-bit) floating-point elements from the low half of a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
dst[31:0] := src1[31:0]
dst[63:32] := src2[31:0]
dst[95:64] := src1[63:32]
dst[127:96] := src2[63:32]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vunpcklps
__m128 _mm_maskz_unpacklo_ps (__mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_maskz_unpacklo_ps (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vunpcklps
CPUID Flags: AVX512VL + AVX512F
Description
Unpack and interleave single-precision (32-bit) floating-point elements from the low half of a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
dst[31:0] := src1[31:0]
dst[63:32] := src2[31:0]
dst[95:64] := src1[63:32]
dst[127:96] := src2[63:32]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
unpcklps
__m128 _mm_unpacklo_ps (__m128 a, __m128 b)
Synopsis
__m128 _mm_unpacklo_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: unpcklps xmm, xmm
CPUID Flags: SSE
Description
Unpack and interleave single-precision (32-bit) floating-point elements from the low half of a and b, and store the results in dst.
Operation
INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
dst[31:0] := src1[31:0]
dst[63:32] := src2[31:0]
dst[95:64] := src1[63:32]
dst[127:96] := src2[63:32]
RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
Performance
vunpcklps
__m256 _mm256_mask_unpacklo_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)
Synopsis
__m256 _mm256_mask_unpacklo_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vunpcklps
CPUID Flags: AVX512VL + AVX512F
Description
Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
dst[31:0] := src1[31:0]
dst[63:32] := src2[31:0]
dst[95:64] := src1[63:32]
dst[127:96] := src2[63:32]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vunpcklps
__m256 _mm256_maskz_unpacklo_ps (__mmask8 k, __m256 a, __m256 b)
Synopsis
__m256 _mm256_maskz_unpacklo_ps (__mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vunpcklps
CPUID Flags: AVX512VL + AVX512F
Description
Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
dst[31:0] := src1[31:0]
dst[63:32] := src2[31:0]
dst[95:64] := src1[63:32]
dst[127:96] := src2[63:32]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vunpcklps
__m256 _mm256_unpacklo_ps (__m256 a, __m256 b)
Synopsis
__m256 _mm256_unpacklo_ps (__m256 a, __m256 b)
#include "immintrin.h"
Instruction: vunpcklps ymm, ymm, ymm
CPUID Flags: AVX
Description
Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst.
Operation
INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
dst[31:0] := src1[31:0]
dst[63:32] := src2[31:0]
dst[95:64] := src1[63:32]
dst[127:96] := src2[63:32]
RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
dst[MAX:256] := 0
Performance
vunpcklps
__m512 _mm512_mask_unpacklo_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
Synopsis
__m512 _mm512_mask_unpacklo_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vunpcklps zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
dst[31:0] := src1[31:0]
dst[63:32] := src2[31:0]
dst[95:64] := src1[63:32]
dst[127:96] := src2[63:32]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384])
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vunpcklps
__m512 _mm512_maskz_unpacklo_ps (__mmask16 k, __m512 a, __m512 b)
Synopsis
__m512 _mm512_maskz_unpacklo_ps (__mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vunpcklps zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
dst[31:0] := src1[31:0]
dst[63:32] := src2[31:0]
dst[95:64] := src1[63:32]
dst[127:96] := src2[63:32]
RETURN dst[127:0]
}
tmp_dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384])
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := tmp_dst[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vunpcklps
__m512 _mm512_unpacklo_ps (__m512 a, __m512 b)
Synopsis
__m512 _mm512_unpacklo_ps (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vunpcklps zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Unpack and interleave single-precision (32-bit) floating-point elements from the low half of each 128-bit lane in a and b, and store the results in dst.
Operation
INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
dst[31:0] := src1[31:0]
dst[63:32] := src2[31:0]
dst[95:64] := src1[63:32]
dst[127:96] := src2[63:32]
RETURN dst[127:0]
}
dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
dst[383:256] := INTERLEAVE_DWORDS(a[383:256], b[383:256])
dst[511:384] := INTERLEAVE_DWORDS(a[511:384], b[511:384])
dst[MAX:512] := 0
...
__m128i _mm_urem_epi32 (__m128i a, __m128i b)
Synopsis
__m128i _mm_urem_epi32 (__m128i a, __m128i b)
#include "immintrin.h"
CPUID Flags: SSE
Description
Divide packed unsigned 32-bit integers in a by packed elements in b, and store the remainders as packed unsigned 32-bit integers in dst.
Operation
FOR j := 0 to 3
i := 32*j
dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:128] := 0
...
__m256i _mm256_urem_epi32 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_urem_epi32 (__m256i a, __m256i b)
#include "immintrin.h"
CPUID Flags: AVX
Description
Divide packed unsigned 32-bit integers in a by packed elements in b, and store the remainders as packed unsigned 32-bit integers in dst.
Operation
FOR j := 0 to 7
i := 32*j
dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:256] := 0
wrfsbase
void _writefsbase_u32 (unsigned int a)
Synopsis
void _writefsbase_u32 (unsigned int a)
#include "immintrin.h"
Instruction: wrfsbase r32
CPUID Flags: FSGSBASE
Description
Write the unsigned 32-bit integer a to the FS segment base register.
Operation
FS_Segment_Base_Register[31:0] := a[31:0];
FS_Segment_Base_Register[63:32] := 0
wrfsbase
void _writefsbase_u64 (unsigned __int64 a)
Synopsis
void _writefsbase_u64 (unsigned __int64 a)
#include "immintrin.h"
Instruction: wrfsbase r64
CPUID Flags: FSGSBASE
Description
Write the unsigned 64-bit integer a to the FS segment base register.
Operation
FS_Segment_Base_Register[63:0] := a[63:0];
wrgsbase
void _writegsbase_u32 (unsigned int a)
Synopsis
void _writegsbase_u32 (unsigned int a)
#include "immintrin.h"
Instruction: wrgsbase r32
CPUID Flags: FSGSBASE
Description
Write the unsigned 32-bit integer a to the GS segment base register.
Operation
GS_Segment_Base_Register[31:0] := a[31:0];
GS_Segment_Base_Register[63:32] := 0
wrgsbase
void _writegsbase_u64 (unsigned __int64 a)
Synopsis
void _writegsbase_u64 (unsigned __int64 a)
#include "immintrin.h"
Instruction: wrgsbase r64
CPUID Flags: FSGSBASE
Description
Write the unsigned 64-bit integer a to the GS segment base register.
Operation
GS_Segment_Base_Register[63:0] := a[63:0];
xabort
void _xabort (const unsigned int imm8)
Synopsis
void _xabort (const unsigned int imm8)
#include "immintrin.h"
Instruction: xabort imm
CPUID Flags: RTM
Description
Force an RTM abort. The EAX register is updated to reflect an XABORT instruction caused the abort, and the imm8 parameter will be provided in bits [31:24] of EAX.
Following an RTM abort, the logical processor resumes execution at the fallback address computed through the outermost XBEGIN instruction.
Operation
IF RTM_ACTIVE = 0
// nop
ELSE
// restore architectural register state
// discard memory updates performed in transaction
// update EAX with status and imm8 value
RTM_NEST_COUNT := 0
RTM_ACTIVE := 0
IF 64-bit Mode
RIP := fallbackRIP
ELSE
EIP := fallbackEIP
FI
FI
xbegin
unsigned int _xbegin (void)
Synopsis
unsigned int _xbegin (void)
#include "immintrin.h"
Instruction: xbegin
CPUID Flags: RTM
Description
Specify the start of an RTM code region.
If the logical processor was not already in transactional execution, then this call causes the logical processor to transition into transactional execution.
On an RTM abort, the logical processor discards all architectural register and memory updates performed during the RTM execution, restores architectural state, and starts execution beginning at the fallback address computed from the outermost XBEGIN instruction.
Operation
IF RTM_NEST_COUNT < MAX_RTM_NEST_COUNT
RTM_NEST_COUNT := RTM_NEST_COUNT + 1
IF RTM_NEST_COUNT = 1
IF 64-bit Mode
fallbackRIP := RIP + SignExtend(IMM)
ELSE IF 32-bit Mode
fallbackEIP := EIP + SignExtend(IMM)
ELSE // 16-bit Mode
fallbackEIP := (EIP + SignExtend(IMM)) AND 0x0000FFFF
FI
RTM_ACTIVE := 1
// enter RTM execution, record register state, start tracking memory state
FI
ELSE
// RTM abort (see _xabort)
FI
xend
void _xend (void)
Synopsis
void _xend (void)
#include "immintrin.h"
Instruction: xend
CPUID Flags: RTM
Description
Specify the end of an RTM code region.
If this corresponds to the outermost scope, the logical processor will attempt to commit the logical processor state atomically.
If the commit fails, the logical processor will perform an RTM abort.
Operation
IF RTM_ACTIVE = 1
RTM_NEST_COUNT := RTM_NEST_COUNT - 1
IF RTM_NEST_COUNT = 0
// try to commit transaction
IF fail to commit transaction
// RTM abort (see _xabort)
ELSE
RTM_ACTIVE := 0
FI
FI
FI
xgetbv
unsigned __int64 _xgetbv (unsigned int a)
Synopsis
unsigned __int64 _xgetbv (unsigned int a)
#include "immintrin.h"
Instruction: xgetbv
CPUID Flags: XSAVE
Description
Copy up to 64 bits from the value of the extended control register (XCR) specified by a into dst. Currently only the XFEATURE_ENABLED_MASK XCR is supported.
Operation
dst[63:0] := XCR[a]
vpxord
__m128i _mm_mask_xor_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_xor_epi32 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpxord
CPUID Flags: AVX512VL + AVX512F
Description
Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vpxord
__m128i _mm_maskz_xor_epi32 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_xor_epi32 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpxord
CPUID Flags: AVX512VL + AVX512F
Description
Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpxord
__m256i _mm256_mask_xor_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_xor_epi32 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpxord
CPUID Flags: AVX512VL + AVX512F
Description
Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vpxord
__m256i _mm256_maskz_xor_epi32 (__mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_xor_epi32 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpxord
CPUID Flags: AVX512VL + AVX512F
Description
Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpxord
__m512i _mm512_mask_xor_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_xor_epi32 (__m512i src, __mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpxord zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vpxord
__m512i _mm512_maskz_xor_epi32 (__mmask16 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_xor_epi32 (__mmask16 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpxord zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpxord
__m512i _mm512_xor_epi32 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_xor_epi32 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpxord zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compute the bitwise XOR of packed 32-bit integers in a and b, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
ENDFOR
dst[MAX:512] := 0
vpxorq
__m128i _mm_mask_xor_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_mask_xor_epi64 (__m128i src, __mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpxorq
CPUID Flags: AVX512VL + AVX512F
Description
Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vpxorq
__m128i _mm_maskz_xor_epi64 (__mmask8 k, __m128i a, __m128i b)
Synopsis
__m128i _mm_maskz_xor_epi64 (__mmask8 k, __m128i a, __m128i b)
#include "immintrin.h"
Instruction: vpxorq
CPUID Flags: AVX512VL + AVX512F
Description
Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
vpxorq
__m256i _mm256_mask_xor_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_mask_xor_epi64 (__m256i src, __mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpxorq
CPUID Flags: AVX512VL + AVX512F
Description
Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vpxorq
__m256i _mm256_maskz_xor_epi64 (__mmask8 k, __m256i a, __m256i b)
Synopsis
__m256i _mm256_maskz_xor_epi64 (__mmask8 k, __m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpxorq
CPUID Flags: AVX512VL + AVX512F
Description
Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vpxorq
__m512i _mm512_mask_xor_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_mask_xor_epi64 (__m512i src, __mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpxorq zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vpxorq
__m512i _mm512_maskz_xor_epi64 (__mmask8 k, __m512i a, __m512i b)
Synopsis
__m512i _mm512_maskz_xor_epi64 (__mmask8 k, __m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpxorq zmm {k}, zmm, zmm
CPUID Flags: AVX512F
Description
Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vpxorq
__m512i _mm512_xor_epi64 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_xor_epi64 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpxorq zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compute the bitwise XOR of packed 64-bit integers in a and b, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
ENDFOR
dst[MAX:512] := 0
vxorpd
__m128d _mm_mask_xor_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_mask_xor_pd (__m128d src, __mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vxorpd
CPUID Flags: AVX512VL + AVX512DQ
Description
Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:128] := 0
vxorpd
__m128d _mm_maskz_xor_pd (__mmask8 k, __m128d a, __m128d b)
Synopsis
__m128d _mm_maskz_xor_pd (__mmask8 k, __m128d a, __m128d b)
#include "immintrin.h"
Instruction: vxorpd
CPUID Flags: AVX512VL + AVX512DQ
Description
Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 1
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
xorpd
__m128d _mm_xor_pd (__m128d a, __m128d b)
Synopsis
__m128d _mm_xor_pd (__m128d a, __m128d b)
#include "emmintrin.h"
Instruction: xorpd xmm, xmm
CPUID Flags: SSE2
Description
Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.
Operation
FOR j := 0 to 1
i := j*64
dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
ENDFOR
Performance
vxorpd
__m256d _mm256_mask_xor_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)
Synopsis
__m256d _mm256_mask_xor_pd (__m256d src, __mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vxorpd
CPUID Flags: AVX512VL + AVX512DQ
Description
Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:256] := 0
vxorpd
__m256d _mm256_maskz_xor_pd (__mmask8 k, __m256d a, __m256d b)
Synopsis
__m256d _mm256_maskz_xor_pd (__mmask8 k, __m256d a, __m256d b)
#include "immintrin.h"
Instruction: vxorpd
CPUID Flags: AVX512VL + AVX512DQ
Description
Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vxorpd
__m256d _mm256_xor_pd (__m256d a, __m256d b)
Synopsis
__m256d _mm256_xor_pd (__m256d a, __m256d b)
#include "immintrin.h"
Instruction: vxorpd ymm, ymm, ymm
CPUID Flags: AVX
Description
Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*64
dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
ENDFOR
dst[MAX:256] := 0
Performance
vxorpd
__m512d _mm512_mask_xor_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
Synopsis
__m512d _mm512_mask_xor_pd (__m512d src, __mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vxorpd
CPUID Flags: AVX512DQ
Description
Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
ELSE
dst[i+63:i] := src[i+63:i]
FI
ENDFOR
dst[MAX:512] := 0
vxorpd
__m512d _mm512_maskz_xor_pd (__mmask8 k, __m512d a, __m512d b)
Synopsis
__m512d _mm512_maskz_xor_pd (__mmask8 k, __m512d a, __m512d b)
#include "immintrin.h"
Instruction: vxorpd
CPUID Flags: AVX512DQ
Description
Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*64
IF k[j]
dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
ELSE
dst[i+63:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vxorpd
__m512d _mm512_xor_pd (__m512d a, __m512d b)
Synopsis
__m512d _mm512_xor_pd (__m512d a, __m512d b)
#include "immintrin.h"
Instruction: vxorpd
CPUID Flags: AVX512DQ
Description
Compute the bitwise XOR of packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*64
dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
ENDFOR
dst[MAX:512] := 0
vxorps
__m128 _mm_mask_xor_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_mask_xor_ps (__m128 src, __mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vxorps
CPUID Flags: AVX512VL + AVX512DQ
Description
Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:128] := 0
vxorps
__m128 _mm_maskz_xor_ps (__mmask8 k, __m128 a, __m128 b)
Synopsis
__m128 _mm_maskz_xor_ps (__mmask8 k, __m128 a, __m128 b)
#include "immintrin.h"
Instruction: vxorps
CPUID Flags: AVX512VL + AVX512DQ
Description
Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 3
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:128] := 0
xorps
__m128 _mm_xor_ps (__m128 a, __m128 b)
Synopsis
__m128 _mm_xor_ps (__m128 a, __m128 b)
#include "xmmintrin.h"
Instruction: xorps xmm, xmm
CPUID Flags: SSE
Description
Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.
Operation
FOR j := 0 to 3
i := j*32
dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
ENDFOR
Performance
vxorps
__m256 _mm256_mask_xor_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)
Synopsis
__m256 _mm256_mask_xor_ps (__m256 src, __mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vxorps
CPUID Flags: AVX512VL + AVX512DQ
Description
Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:256] := 0
vxorps
__m256 _mm256_maskz_xor_ps (__mmask8 k, __m256 a, __m256 b)
Synopsis
__m256 _mm256_maskz_xor_ps (__mmask8 k, __m256 a, __m256 b)
#include "immintrin.h"
Instruction: vxorps
CPUID Flags: AVX512VL + AVX512DQ
Description
Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 7
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:256] := 0
vxorps
__m256 _mm256_xor_ps (__m256 a, __m256 b)
Synopsis
__m256 _mm256_xor_ps (__m256 a, __m256 b)
#include "immintrin.h"
Instruction: vxorps ymm, ymm, ymm
CPUID Flags: AVX
Description
Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.
Operation
FOR j := 0 to 7
i := j*32
dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
ENDFOR
dst[MAX:256] := 0
Performance
vxorps
__m512 _mm512_mask_xor_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
Synopsis
__m512 _mm512_mask_xor_ps (__m512 src, __mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vxorps
CPUID Flags: AVX512DQ
Description
Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
ELSE
dst[i+31:i] := src[i+31:i]
FI
ENDFOR
dst[MAX:512] := 0
vxorps
__m512 _mm512_maskz_xor_ps (__mmask16 k, __m512 a, __m512 b)
Synopsis
__m512 _mm512_maskz_xor_ps (__mmask16 k, __m512 a, __m512 b)
#include "immintrin.h"
Instruction: vxorps
CPUID Flags: AVX512DQ
Description
Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
Operation
FOR j := 0 to 15
i := j*32
IF k[j]
dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
ELSE
dst[i+31:i] := 0
FI
ENDFOR
dst[MAX:512] := 0
vxorps
__m512 _mm512_xor_ps (__m512 a, __m512 b)
Synopsis
__m512 _mm512_xor_ps (__m512 a, __m512 b)
#include "immintrin.h"
Instruction: vxorps
CPUID Flags: AVX512DQ
Description
Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst.
Operation
FOR j := 0 to 15
i := j*32
dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
ENDFOR
dst[MAX:512] := 0
pxor
__m128i _mm_xor_si128 (__m128i a, __m128i b)
Synopsis
__m128i _mm_xor_si128 (__m128i a, __m128i b)
#include "emmintrin.h"
Instruction: pxor xmm, xmm
CPUID Flags: SSE2
Description
Compute the bitwise XOR of 128 bits (representing integer data) in a and b, and store the result in dst.
Operation
dst[127:0] := (a[127:0] XOR b[127:0])
Performance
vpxor
__m256i _mm256_xor_si256 (__m256i a, __m256i b)
Synopsis
__m256i _mm256_xor_si256 (__m256i a, __m256i b)
#include "immintrin.h"
Instruction: vpxor ymm, ymm, ymm
CPUID Flags: AVX2
Description
Compute the bitwise XOR of 256 bits (representing integer data) in a and b, and store the result in dst.
Operation
dst[255:0] := (a[255:0] XOR b[255:0])
dst[MAX:256] := 0
Performance
vpxord
__m512i _mm512_xor_si512 (__m512i a, __m512i b)
Synopsis
__m512i _mm512_xor_si512 (__m512i a, __m512i b)
#include "immintrin.h"
Instruction: vpxord zmm {k}, zmm, zmm
CPUID Flags: AVX512F for AVX-512, KNCNI for KNC
Description
Compute the bitwise XOR of 512 bits (representing integer data) in a and b, and store the result in dst.
Operation
dst[511:0] := (a[511:0] XOR b[511:0])
dst[MAX:512] := 0
xrstor
void _xrstor (void * mem_addr, unsigned __int64 rs_mask)
Synopsis
void _xrstor (void * mem_addr, unsigned __int64 rs_mask)
#include "immintrin.h"
Instruction: xrstor MEMmxsave
CPUID Flags: XSAVE
Description
Perform a full or partial restore of the enabled processor states using the state information stored in memory at mem_addr. State is restored based on bits [62:0] in rs_mask, XCR0, and mem_addr.HEADER.XSTATE_BV. mem_addr must be aligned on a 64-byte boundary.
Operation
st_mask = mem_addr.HEADER.XSTATE_BV[62:0]
FOR i := 0 to 62
IF (rs_mask[i] AND XCR0[i])
IF st_mask[i]
CASE (i) OF
0: ProcessorState[x87 FPU] := mem_addr.FPUSSESave_Area[FPU]
1: ProcessorState[SSE] := mem_addr.FPUSSESaveArea[SSE]
DEFAULT: ProcessorState[i] := mem_addr.Ext_Save_Area[i]
ESAC
ELSE
// ProcessorExtendedState := Processor Supplied Values
CASE (i) OF
1: MXCSR := mem_addr.FPUSSESave_Area[SSE]
ESAC
FI
FI
i := i + 1
ENDFOR
xrstor64
void _xrstor64 (void * mem_addr, unsigned __int64 rs_mask)
Synopsis
void _xrstor64 (void * mem_addr, unsigned __int64 rs_mask)
#include "immintrin.h"
Instruction: xrstor64 MEMmxsave
CPUID Flags: XSAVE
Description
Perform a full or partial restore of the enabled processor states using the state information stored in memory at mem_addr. State is restored based on bits [62:0] in rs_mask, XCR0, and mem_addr.HEADER.XSTATE_BV. mem_addr must be aligned on a 64-byte boundary.
Operation
st_mask = mem_addr.HEADER.XSTATE_BV[62:0]
FOR i := 0 to 62
IF (rs_mask[i] AND XCR0[i])
IF st_mask[i]
CASE (i) OF
0: ProcessorState[x87 FPU] := mem_addr.FPUSSESave_Area[FPU]
1: ProcessorState[SSE] := mem_addr.FPUSSESaveArea[SSE]
DEFAULT: ProcessorState[i] := mem_addr.Ext_Save_Area[i]
ESAC
ELSE
// ProcessorExtendedState := Processor Supplied Values
CASE (i) OF
1: MXCSR := mem_addr.FPUSSESave_Area[SSE]
ESAC
FI
FI
i := i + 1
ENDFOR
xrstors
void _xrstors (const void * mem_addr, unsigned __int64 rs_mask)
Synopsis
void _xrstors (const void * mem_addr, unsigned __int64 rs_mask)
#include "immintrin.h"
Instruction: xrstors MEMmxsave
CPUID Flags: XSAVE + XSS
Description
Perform a full or partial restore of the enabled processor states using the state information stored in memory at mem_addr. xrstors differs from xrstor in that it can restore state components corresponding to bits set in the IA32_XSS MSR; xrstors cannot restore from an xsave area in which the extended region is in the standard form. State is restored based on bits [62:0] in rs_mask, XCR0, and mem_addr.HEADER.XSTATE_BV. mem_addr must be aligned on a 64-byte boundary.
Operation
st_mask = mem_addr.HEADER.XSTATE_BV[62:0]
FOR i := 0 to 62
IF (rs_mask[i] AND XCR0[i])
IF st_mask[i]
CASE (i) OF
0: ProcessorState[x87 FPU] := mem_addr.FPUSSESave_Area[FPU]
1: ProcessorState[SSE] := mem_addr.FPUSSESaveArea[SSE]
DEFAULT: ProcessorState[i] := mem_addr.Ext_Save_Area[i]
ESAC
ELSE
// ProcessorExtendedState := Processor Supplied Values
CASE (i) OF
1: MXCSR := mem_addr.FPUSSESave_Area[SSE]
ESAC
FI
FI
i := i + 1
ENDFOR
xrstors64
void _xrstors64 (const void * mem_addr, unsigned __int64 rs_mask)
Synopsis
void _xrstors64 (const void * mem_addr, unsigned __int64 rs_mask)
#include "immintrin.h"
Instruction: xrstors64 MEMmxsave
CPUID Flags: XSAVE + XSS
Description
Perform a full or partial restore of the enabled processor states using the state information stored in memory at mem_addr. xrstors differs from xrstor in that it can restore state components corresponding to bits set in the IA32_XSS MSR; xrstors cannot restore from an xsave area in which the extended region is in the standard form. State is restored based on bits [62:0] in rs_mask, XCR0, and mem_addr.HEADER.XSTATE_BV. mem_addr must be aligned on a 64-byte boundary.
Operation
st_mask = mem_addr.HEADER.XSTATE_BV[62:0]
FOR i := 0 to 62
IF (rs_mask[i] AND XCR0[i])
IF st_mask[i]
CASE (i) OF
0: ProcessorState[x87 FPU] := mem_addr.FPUSSESave_Area[FPU]
1: ProcessorState[SSE] := mem_addr.FPUSSESaveArea[SSE]
DEFAULT: ProcessorState[i] := mem_addr.Ext_Save_Area[i]
ESAC
ELSE
// ProcessorExtendedState := Processor Supplied Values
CASE (i) OF
1: MXCSR := mem_addr.FPUSSESave_Area[SSE]
ESAC
FI
FI
i := i + 1
ENDFOR
xsave
void _xsave (void * mem_addr, unsigned __int64 save_mask)
Synopsis
void _xsave (void * mem_addr, unsigned __int64 save_mask)
#include "immintrin.h"
Instruction: xsave MEMmxsave
CPUID Flags: XSAVE
Description
Perform a full or partial save of the enabled processor states to memory at mem_addr. State is saved based on bits [62:0] in save_mask and XCR0. mem_addr must be aligned on a 64-byte boundary.
Operation
mask[62:0] := save_mask[62:0] BITWISE AND XCR0[62:0]
FOR i := 0 to 62
IF mask[i]
CASE (i) OF
0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87 FPU]
1: mem_addr.FPUSSESaveArea[SSE] := ProcessorState[SSE]
DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i]
ESAC
mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i]
FI
i := i + 1
ENDFOR
xsave64
void _xsave64 (void * mem_addr, unsigned __int64 save_mask)
Synopsis
void _xsave64 (void * mem_addr, unsigned __int64 save_mask)
#include "immintrin.h"
Instruction: xsave64 MEMmxsave
CPUID Flags: XSAVE
Description
Perform a full or partial save of the enabled processor states to memory at mem_addr. State is saved based on bits [62:0] in save_mask and XCR0. mem_addr must be aligned on a 64-byte boundary.
Operation
mask[62:0] := save_mask[62:0] BITWISE AND XCR0[62:0]
FOR i := 0 to 62
IF mask[i]
CASE (i) OF
0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87 FPU]
1: mem_addr.FPUSSESaveArea[SSE] := ProcessorState[SSE]
DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i]
ESAC
mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i]
FI
i := i + 1
ENDFOR
xsavec
void _xsavec (void * mem_addr, unsigned __int64 save_mask)
Synopsis
void _xsavec (void * mem_addr, unsigned __int64 save_mask)
#include "immintrin.h"
Instruction: xsavec MEMmxsave
CPUID Flags: XSAVE + XSAVEC
Description
Perform a full or partial save of the enabled processor states to memory at mem_addr; xsavec differs from xsave in that it uses compaction and that it may use init optimization. State is saved based on bits [62:0] in save_mask and XCR0. mem_addr must be aligned on a 64-byte boundary.
Operation
mask[62:0] := save_mask[62:0] BITWISE AND XCR0[62:0]
FOR i := 0 to 62
IF mask[i]
CASE (i) OF
0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87 FPU]
1: mem_addr.FPUSSESaveArea[SSE] := ProcessorState[SSE]
DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i]
ESAC
mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i]
FI
i := i + 1
ENDFOR
xsavec64
void _xsavec64 (void * mem_addr, unsigned __int64 save_mask)
Synopsis
void _xsavec64 (void * mem_addr, unsigned __int64 save_mask)
#include "immintrin.h"
Instruction: xsavec64 MEMmxsave
CPUID Flags: XSAVE + XSAVEC
Description
Perform a full or partial save of the enabled processor states to memory at mem_addr; xsavec differs from xsave in that it uses compaction and that it may use init optimization. State is saved based on bits [62:0] in save_mask and XCR0. mem_addr must be aligned on a 64-byte boundary.
Operation
mask[62:0] := save_mask[62:0] BITWISE AND XCR0[62:0]
FOR i := 0 to 62
IF mask[i]
CASE (i) OF
0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87 FPU]
1: mem_addr.FPUSSESaveArea[SSE] := ProcessorState[SSE]
DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i]
ESAC
mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i]
FI
i := i + 1
ENDFOR
xsaveopt
void _xsaveopt (void * mem_addr, unsigned __int64 save_mask)
Synopsis
void _xsaveopt (void * mem_addr, unsigned __int64 save_mask)
#include "immintrin.h"
Instruction: xsaveopt MEMmxsave
CPUID Flags: XSAVE + XSAVEOPT
Description
Perform a full or partial save of the enabled processor states to memory at mem_addr. State is saved based on bits [62:0] in save_mask and XCR0. mem_addr must be aligned on a 64-byte boundary. The hardware may optimize the manner in which data is saved. The performance of this instruction will be equal to or better than using the XSAVE instruction.
Operation
mask[62:0] := save_mask[62:0] BITWISE AND XCR0[62:0]
FOR i := 0 to 62
IF mask[i]
CASE (i) OF
0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87 FPU]
1: mem_addr.FPUSSESaveArea[SSE] := ProcessorState[SSE]
2: mem_addr.EXT_SAVE_Area2[YMM] := ProcessorState[YMM]
DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i]
ESAC
mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i]
FI
i := i + 1
ENDFOR
xsaveopt64
void _xsaveopt64 (void * mem_addr, unsigned __int64 save_mask)
Synopsis
void _xsaveopt64 (void * mem_addr, unsigned __int64 save_mask)
#include "immintrin.h"
Instruction: xsaveopt64 MEMmxsave
CPUID Flags: XSAVE + XSAVEOPT
Description
Perform a full or partial save of the enabled processor states to memory at mem_addr. State is saved based on bits [62:0] in save_mask and XCR0. mem_addr must be aligned on a 64-byte boundary. The hardware may optimize the manner in which data is saved. The performance of this instruction will be equal to or better than using the XSAVE64 instruction.
Operation
mask[62:0] := save_mask[62:0] BITWISE AND XCR0[62:0]
FOR i := 0 to 62
IF mask[i]
CASE (i) OF
0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87 FPU]
1: mem_addr.FPUSSESaveArea[SSE] := ProcessorState[SSE]
2: mem_addr.EXT_SAVE_Area2[YMM] := ProcessorState[YMM]
DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i]
ESAC
mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i]
FI
i := i + 1
ENDFOR
xsaves
void _xsaves (void * mem_addr, unsigned __int64 save_mask)
Synopsis
void _xsaves (void * mem_addr, unsigned __int64 save_mask)
#include "immintrin.h"
Instruction: xsaves MEMmxsave
CPUID Flags: XSAVE + XSS
Description
Perform a full or partial save of the enabled processor states to memory at mem_addr; xsaves differs from xsave in that it can save state components corresponding to bits set in IA32_XSS MSR and that it may use the modified optimization. State is saved based on bits [62:0] in save_mask and XCR0. mem_addr must be aligned on a 64-byte boundary.
Operation
mask[62:0] := save_mask[62:0] BITWISE AND XCR0[62:0]
FOR i := 0 to 62
IF mask[i]
CASE (i) OF
0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87 FPU]
1: mem_addr.FPUSSESaveArea[SSE] := ProcessorState[SSE]
DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i]
ESAC
mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i]
FI
i := i + 1
ENDFOR
xsaves64
void _xsaves64 (void * mem_addr, unsigned __int64 save_mask)
Synopsis
void _xsaves64 (void * mem_addr, unsigned __int64 save_mask)
#include "immintrin.h"
Instruction: xsaves64 MEMmxsave
CPUID Flags: XSAVE + XSS
Description
Perform a full or partial save of the enabled processor states to memory at mem_addr; xsaves differs from xsave in that it can save state components corresponding to bits set in IA32_XSS MSR and that it may use the modified optimization. State is saved based on bits [62:0] in save_mask and XCR0. mem_addr must be aligned on a 64-byte boundary.
Operation
mask[62:0] := save_mask[62:0] BITWISE AND XCR0[62:0]
FOR i := 0 to 62
IF mask[i]
CASE (i) OF
0: mem_addr.FPUSSESave_Area[FPU] := ProcessorState[x87 FPU]
1: mem_addr.FPUSSESaveArea[SSE] := ProcessorState[SSE]
DEFAULT: mem_addr.Ext_Save_Area[i] := ProcessorState[i]
ESAC
mem_addr.HEADER.XSTATE_BV[i] := INIT_FUNCTION[i]
FI
i := i + 1
ENDFOR
xsetbv
void _xsetbv (unsigned int a, unsigned __int64 val)
Synopsis
void _xsetbv (unsigned int a, unsigned __int64 val)
#include "immintrin.h"
Instruction: xsetbv
CPUID Flags: XSAVE
Description
Copy 64-bits from val to the extended control register (XCR) specified by a. Currently only XFEATURE_ENABLED_MASK XCR is supported.
Operation
XCR[a] := val[63:0]
xtest
unsigned char _xtest (void)
Synopsis
unsigned char _xtest (void)
#include "immintrin.h"
Instruction: xtest
CPUID Flags: RTM
Description
Query the transactional execution status, return 1 if inside a transactionally executing RTM or HLE region, and return 0 otherwise.
Operation
IF (RTM_ACTIVE = 1 OR HLE_ACTIVE = 1)
dst := 1
ELSE
dst := 0
FI
vzeroall
void _mm256_zeroall (void)
Synopsis
void _mm256_zeroall (void)
#include "immintrin.h"
Instruction: vzeroall
CPUID Flags: AVX
Description
Zero the contents of all XMM or YMM registers.
Operation
YMM0[MAX:0] := 0
YMM1[MAX:0] := 0
YMM2[MAX:0] := 0
YMM3[MAX:0] := 0
YMM4[MAX:0] := 0
YMM5[MAX:0] := 0
YMM6[MAX:0] := 0
YMM7[MAX:0] := 0
IF 64-bit mode
YMM8[MAX:0] := 0
YMM9[MAX:0] := 0
YMM10[MAX:0] := 0
YMM11[MAX:0] := 0
YMM12[MAX:0] := 0
YMM13[MAX:0] := 0
YMM14[MAX:0] := 0
YMM15[MAX:0] := 0
FI
vzeroupper
void _mm256_zeroupper (void)
Synopsis
void _mm256_zeroupper (void)
#include "immintrin.h"
Instruction: vzeroupper
CPUID Flags: AVX
Description
Zero the upper 128 bits of all YMM registers; the lower 128-bits of the registers are unmodified.
Operation
YMM0[MAX:128] := 0
YMM1[MAX:128] := 0
YMM2[MAX:128] := 0
YMM3[MAX:128] := 0
YMM4[MAX:128] := 0
YMM5[MAX:128] := 0
YMM6[MAX:128] := 0
YMM7[MAX:128] := 0
IF 64-bit mode
YMM8[MAX:128] := 0
YMM9[MAX:128] := 0
YMM10[MAX:128] := 0
YMM11[MAX:128] := 0
YMM12[MAX:128] := 0
YMM13[MAX:128] := 0
YMM14[MAX:128] := 0
YMM15[MAX:128] := 0
FI
Performance